def reconstruct(opts_path):
    """reconstruct from channel data"""
    opts = loadOptions(opts_path)
    # normalize paths according to the platform
    opts['extra']['src_dir'] = \
        os.path.expanduser(os.path.normpath(opts['extra']['src_dir']))
    opts['extra']['dest_dir'] = \
        os.path.expanduser(os.path.normpath(opts['extra']['dest_dir']))
    # load data from hdf5 files
    ind = opts['load']['EXP_START']
    if opts['load']['EXP_END'] != -1 and \
            opts['load']['EXP_END'] != ind:
        notifyCli('WARNING: multiple experiments selected. '
                  'Only the first dataset will be processed')
    chn_data, chn_data_3d = load_hdf5_data(
        opts['extra']['dest_dir'], ind)
    if opts['unpack']['Show_Image'] != 0:
        notifyCli('Currently only Show_Image = 0 is supported.')
    # initialize PyCUDA environment
    cuda.init()
    dev = cuda.Device(0)
    ctx = dev.make_context()
    reImg = reconstruction_3d(chn_data_3d, opts['recon'])
    ctx.pop()
    del ctx
    save_reconstructed_image(reImg, opts['extra']['dest_dir'],
                             ind, 'tiff', '_3d')
def gpuFunc(iterator):
    # 1. Data preparation
    iterator = iter(iterator)
    cpu_data = list(iterator)
    cpu_dataset = " ".join(cpu_data)
    ascii_data = np.asarray([ord(x) for x in cpu_dataset], dtype=np.uint8)

    # 2. Driver initialization and data transfer
    cuda.init()
    dev = cuda.Device(0)
    contx = dev.make_context()
    gpu_dataset = gpuarray.to_gpu(ascii_data)

    # 3. GPU kernel.
    # The kernel's algorithm counts the words by keeping
    # track of the space between them
    countkrnl = reduction.ReductionKernel(long, neutral="0",
                                          map_expr="(a[i] == 32)*(b[i] != 32)",
                                          reduce_expr="a + b",
                                          arguments="char *a, char *b")
    results = countkrnl(gpu_dataset[:-1], gpu_dataset[1:]).get()
    yield results

    # Release GPU context resources
    contx.pop()
    del gpu_dataset
    del contx
    gc.collect()
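# Hedged usage sketch (not from the original source): gpuFunc above is written as
# a partition-level generator, so it would normally be applied with Spark's
# RDD.mapPartitions. The SparkContext setup and the input path below are
# assumptions made purely for illustration.
from pyspark import SparkContext

sc = SparkContext(appName="gpu-word-count")
lines = sc.textFile("hdfs:///some/text/file")   # hypothetical input location
partial_counts = lines.mapPartitions(gpuFunc)   # one GPU word count per partition
total_words = partial_counts.sum()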
def worker():
    comm = MPI.Comm.Get_parent()
    size = comm.Get_size()
    rank = comm.Get_rank()
    name = MPI.Get_processor_name()

    import pycuda.driver as drv
    drv.init()

    # Find maximum number of available GPUs:
    max_gpus = drv.Device.count()

    # Use modular arithmetic to avoid assigning a nonexistent GPU:
    n = rank % max_gpus
    dev = drv.Device(n)
    ctx = dev.make_context()
    atexit.register(ctx.pop)

    # Execute a kernel:
    import pycuda.gpuarray as gpuarray
    from pycuda.elementwise import ElementwiseKernel
    kernel = ElementwiseKernel('double *y, double *x, double a',
                               'y[i] = a*x[i]')
    x_gpu = gpuarray.to_gpu(np.random.rand(2))
    y_gpu = gpuarray.empty_like(x_gpu)
    kernel(y_gpu, x_gpu, np.double(2.0))

    print 'I am process %d of %d on CPU %s using GPU %s of %s [x_gpu=%s, y_gpu=%s]' % \
        (rank, size, name, n, max_gpus, str(x_gpu.get()), str(y_gpu.get()))

    comm.Disconnect()
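# Hedged sketch of the parent side (not part of the original example): worker()
# above calls MPI.Comm.Get_parent(), so it is meant to run inside processes
# spawned by a parent through mpi4py. The script name 'worker_script.py' and the
# process count are assumptions for illustration only.
import sys
from mpi4py import MPI

def spawn_workers(n_workers=2):
    comm = MPI.COMM_SELF.Spawn(sys.executable,
                               args=['worker_script.py'],  # hypothetical script that calls worker()
                               maxprocs=n_workers)
    comm.Disconnect()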
def init_device(device='gpu0'):
    if device.startswith('cuda'):
        import os
        if 'THEANO_FLAGS' in os.environ:
            raise ValueError('Use theanorc to set the theano config')
        os.environ['THEANO_FLAGS'] = 'device={0}'.format(device)
        import theano.gpuarray
        # This is a bit of black magic that may stop working in future
        # theano releases
        ctx = theano.gpuarray.type.get_context(None)
        drv = None
    elif device.startswith('gpu'):
        gpuid = int(device[-1])
        import pycuda.driver as drv
        drv.init()
        dev = drv.Device(gpuid)
        ctx = dev.make_context()
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    else:
        drv = None
        ctx = None
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano

    from theano import function, config, shared, sandbox, tensor
    vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
    iters = 1000

    rng = np.random.RandomState(22)
    arr = rng.rand(vlen)
    shared_x = theano.shared(np.asarray(arr, config.floatX))
    shared_xx = theano.shared(np.asarray(arr, config.floatX))
    x = tensor.fvector("x")
    # compile a function so that shared_x will be set to part of a
    # computing graph on GPU (CudaNdarray)
    f = function([], tensor.exp(x), givens=[(x, shared_x)])

    if np.any([isinstance(x.op, tensor.Elemwise) and
               ('Gpu' not in type(x.op).__name__)
               for x in f.maker.fgraph.toposort()]):
        print('Used the cpu')
    else:
        print('Used the gpu')
    # if np.any([isinstance(x.op, tensor.Elemwise) for x in f.maker.fgraph.toposort()]) and device != 'cpu':
    #     raise TypeError('graph not compiled on GPU')

    return drv, ctx, arr, shared_x, shared_xx
def fun_load(config, sock_data=5000): send_queue = config['queue_l2t'] recv_queue = config['queue_t2l'] # recv_queue and send_queue are multiprocessing.Queue # recv_queue is only for receiving # send_queue is only for sending # if need to do random crop and mirror flag_randproc = not config['use_data_layer'] flag_batch = config['batch_crop_mirror'] drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx = dev.make_context() sock = zmq.Context().socket(zmq.PAIR) sock.bind('tcp://*:{0}'.format(sock_data)) shape, dtype, h = sock.recv_pyobj() print 'shared_x information received', shape, dtype shape = (3, 255, 255, 256) # TODO remove fix gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=drv.IPCMemoryHandle(h)) gpu_data = gpuarray.GPUArray(shape, dtype) img_mean = recv_queue.get() print 'img_mean received' # The first time, do the set ups and other stuff # receive information for loading while True: # getting the hkl file name to load hkl_name = recv_queue.get() # print hkl_name #data = pickle.load(open(hkl_name)) - img_mean data = hkl.load(hkl_name) - img_mean # print 'load ', time.time() - bgn_time if flag_randproc: param_rand = recv_queue.get() data = crop_and_mirror(data, param_rand, flag_batch=flag_batch) gpu_data.set(data) # wait for computation on last minibatch to finish msg = recv_queue.get() assert msg == 'calc_finished' drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr, gpu_data.dtype.itemsize * gpu_data.size, ctx, ctx) ctx.synchronize() send_queue.put('copy_finished')
def _init_gpu(comm):
    """ Chooses a gpu and creates a context on it. """
    # Find out how many GPUs are available to us on this node.
    driver.init()
    num_gpus = driver.Device.count()

    # Figure out the names of the other hosts.
    rank = comm.Get_rank()           # Find out which process I am.
    name = MPI.Get_processor_name()  # The name of my node.
    hosts = comm.allgather(name)     # Get the names of all the other hosts.

    # Find out which GPU to take (by precedence).
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError("No GPU available.")

    # Create a context on the appropriate device.
    for k in range(num_gpus):
        try:
            device = driver.Device((gpu_id + k) % num_gpus)
            context = device.make_context()
        except:
            continue
        else:
            # print "On %s: process %d taking gpu %d of %d.\n" % \
            #     (name, rank, gpu_id+k, num_gpus)
            break

    return device, context  # Return device and context.
def choose_gpu():
    # Find out how many GPUs are available to us on this node.
    drv.init()
    num_gpus = drv.Device.count()

    # Figure out the names of the other hosts.
    rank = MPI.COMM_WORLD.Get_rank()        # Find out which process I am.
    name = MPI.Get_processor_name()         # The name of my node.
    hosts = MPI.COMM_WORLD.allgather(name)  # Get the names of all the other hosts.

    # Figure out our precedence on this node.
    # Make sure the number of hosts and processes are equal.
    num_processes = MPI.COMM_WORLD.Get_size()
    if len(hosts) != num_processes:
        raise TypeError('Number of hosts and number of processes do not match.')

    # Make sure the name of my node matches.
    if name != hosts[rank]:
        # print name, hosts[rank]
        raise TypeError('Hostname does not match.')

    # Find out which GPU to take.
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError('No GPU available.')
    # sys.stdout.write("On %s: %d/%d taking gpu %d/%d.\n" % \
    #     (name, rank, num_processes, gpu_id, num_gpus))

    # Make and return a context on the device.
    return drv.Device(gpu_id).make_context()
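# Hedged usage sketch (not in the original source): under mpirun, each rank picks
# its own GPU by counting how many ranks with the same hostname precede it. The
# body of run_on_my_gpu is a placeholder; only choose_gpu() comes from above.
def run_on_my_gpu():
    ctx = choose_gpu()
    try:
        pass  # launch kernels / allocate gpuarrays here
    finally:
        ctx.pop()  # always release the context so the device is left clean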
def __init__(self, options, gpu_id):
    """Initializes the CUDA backend.

    :param options: LBConfig object
    :param gpu_id: number of the GPU to use
    """
    cuda.init()
    self.buffers = {}
    self.arrays = {}
    self._kern_stats = set()
    self.options = options
    self._device = cuda.Device(gpu_id)
    self._ctx = self._device.make_context(
        flags=cuda.ctx_flags.SCHED_AUTO if not options.cuda_sched_yield
        else cuda.ctx_flags.SCHED_YIELD)

    if (options.precision == 'double' and
            self._device.compute_capability()[0] >= 3):
        if hasattr(self._ctx, 'set_shared_config'):
            self._ctx.set_shared_config(cuda.shared_config.EIGHT_BYTE_BANK_SIZE)

    # To keep track of allocated memory.
    self._total_memory_bytes = 0

    self._iteration_kernels = []
def n_blocks(self):
    n_blocks = self.opts.get('n_blocks')
    if n_blocks is None:
        default_threads_per_block = 32
        bytes_per_float = 4
        memory_per_thread = (self._len_species + 1) * bytes_per_float
        if cuda is None:
            threads_per_block = default_threads_per_block
        else:
            cuda.init()
            device = cuda.Device(self.gpu[0])
            attrs = device.get_attributes()
            shared_memory_per_block = attrs[
                cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK]
            upper_limit_threads_per_block = attrs[
                cuda.device_attribute.MAX_THREADS_PER_BLOCK]
            max_threads_per_block = min(
                shared_memory_per_block / memory_per_thread,
                upper_limit_threads_per_block)
            threads_per_block = min(max_threads_per_block,
                                    default_threads_per_block)
        n_blocks = int(
            np.ceil(1. * len(self.param_values) / threads_per_block))
        self._logger.debug('n_blocks set to {} (used pycuda: {})'.format(
            n_blocks, cuda is not None
        ))
    self.n_blocks = n_blocks
    return n_blocks
def _init_gpu(self):
    """
    Initialize GPU device.

    Notes
    -----
    Must be called from within the `run()` method, not from within
    `__init__()`.
    """
    if self.device is None:
        self.log_info('no GPU specified - not initializing')
    else:
        # Import pycuda.driver here so as to facilitate the
        # subclassing of Module to create pure Python LPUs that don't use GPUs:
        import pycuda.driver as drv
        drv.init()
        try:
            self.gpu_ctx = drv.Device(self.device).make_context()
        except Exception as e:
            self.log_info('_init_gpu exception: ' + e.message)
        else:
            atexit.register(self.gpu_ctx.pop)
            self.log_info('GPU initialized')
def __init__(self, device_num=0, sync_calls=False):
    cuda.init()
    # self.context = pycuda.tools.make_default_context()
    # self.device = self.context.get_device()
    self.device = cuda.Device(device_num)
    self.context = self.device.make_context()
    self.stream = cuda.Stream()

    self.max_block_size = self.device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_X)
    self.max_grid_size_x = self.device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_X)
    self.max_grid_size_y = self.device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Y)
    self.max_grid_size_x_pow2 = 2 ** log2(self.max_grid_size_x)
    self.max_registers = self.device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_BLOCK)
    self.warp_size = self.device.get_attribute(cuda.device_attribute.WARP_SIZE)

    self.gpu = True
    self.cuda = True
    self._sync_calls = sync_calls

    self.allocated = 0
def run_kernel_on_gpus(self, vec_a, vec_b):
    drv.init()
    num = drv.Device.count()
    num = 1

    vector_len = vec_b.shape[0]
    sections = range(0, vector_len, vector_len / num)
    sections = sections[1:]
    print "section on gpus:"
    print sections
    sub_vec_bs = numpy.split(vec_b, sections)

    gpu_thread_list = []
    for i in range(num):
        gpu_thread = GPUThread(i, vec_a, sub_vec_bs[i], self.block, self.grid)
        gpu_thread.start()
        gpu_thread_list.append(gpu_thread)

    dest = numpy.array([])
    for gpu in gpu_thread_list:
        gpu.join()
        dest = numpy.concatenate((dest, gpu.vec_b))
    print dest
    return dest
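# Hedged sketch (an assumption, not the original class): run_kernel_on_gpus above
# relies on a GPUThread helper defined elsewhere. A minimal version would create
# its own context inside run(), since a CUDA context is bound to the thread that
# creates it. The actual kernel launch is left as a placeholder.
import threading
import pycuda.driver as drv

class GPUThread(threading.Thread):
    def __init__(self, device_id, vec_a, vec_b, block, grid):
        super(GPUThread, self).__init__()
        self.device_id = device_id
        self.vec_a = vec_a
        self.vec_b = vec_b
        self.block = block
        self.grid = grid

    def run(self):
        ctx = drv.Device(self.device_id).make_context()
        try:
            pass  # copy self.vec_a / self.vec_b to the device and launch the kernel here
        finally:
            ctx.pop()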
def __init__(self, shape, dtype=numpy.float32, stream=None,
             allocator=drv.mem_alloc, cuda_device=0):
    try:
        drv.init()
        ctx = drv.Device(0).make_context()
    except RuntimeError:
        # device is already initialized! so we ignore this
        # ugly, but works for now
        pass

    # which device are we working on
    self.cuda_device = cuda_device
    # internal shape
    self.shape = shape
    # internal type
    self.dtype = numpy.dtype(dtype)

    from pytools import product
    # internal size
    self.size = product(shape)

    self.allocator = allocator
    if self.size:
        self.gpudata = self.allocator(self.size * self.dtype.itemsize)
    else:
        self.gpudata = None
    self.stream = stream
    self._update_kernel_kwargs()
def __init__(self, device_number=0, thread_per_block=512, **kwargs):
    self.device_number = device_number
    self.thread_per_block = thread_per_block
    self.device_type = 'nvidia_gpu'
    self.language = 'cuda'
    self.code_type = 'cu'

    try:
        import pycuda.driver as cuda
        cuda.init()
    except Exception as e:
        logger.error("Error: CUDA initialization error", exc_info=True)
        raise SystemExit

    max_devices = cuda.Device.count()
    if max_devices == 0:
        logger.error("Error: There is no CUDA device (NVIDIA GPU).")
        raise SystemExit
    elif device_number >= max_devices:
        logger.error("Error: The given device_number(%d) is bigger than physical GPU devices(%d)." % (device_number, max_devices))
        raise SystemExit
    else:
        device = cuda.Device(device_number)
        context = device.make_context()

        import atexit
        atexit.register(context.pop)

        self.cuda = cuda
        self.device = device
        self.context = context
def _init_gpu(self):
    """
    Initialize GPU device.

    Notes
    -----
    Must be called from within the `run()` method, not from within
    `__init__()`.
    """
    if self.device is None:
        self.log_info('no GPU specified - not initializing')
    else:
        # Import pycuda.driver here so as to facilitate the
        # subclassing of Module to create pure Python LPUs that don't use GPUs:
        import pycuda.driver as drv
        drv.init()
        N_gpu = drv.Device.count()
        if not self.device < N_gpu:
            new_device = randint(0, N_gpu - 1)
            self.log_warning("GPU device %d not in GPU devices %s" %
                             (self.device, str(range(0, N_gpu))))
            self.log_warning("Setting device = %d" % new_device)
            self.device = new_device

        try:
            self.gpu_ctx = drv.Device(self.device).make_context()
        except Exception as e:
            self.log_info('_init_gpu exception: ' + e.message)
        else:
            atexit.register(self.gpu_ctx.pop)
            self.log_info('GPU %s initialized' % self.device)
def get_device_count(verbose=False):
    """
    Query device count through PyCuda.

    Arguments:
        verbose (bool): prints verbose logging if True, default False.

    Returns:
        int: Number of GPUs available.
    """
    try:
        import pycuda
        import pycuda.driver as drv
    except ImportError:
        if verbose:
            print("PyCUDA module not found")
        return 0

    try:
        drv.init()
    except pycuda._driver.RuntimeError as e:
        print("PyCUDA Runtime error: {0}".format(str(e)))
        return 0

    count = drv.Device.count()
    if verbose:
        print("Found %d GPU(s)" % count)
    return count
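# Hedged usage sketch (not from the original source): get_device_count() degrades
# gracefully to 0 when PyCUDA or a working driver is missing, so a caller can use
# it to choose a backend without its own try/except. The backend names here are
# placeholders, purely illustrative.
def pick_backend():
    if get_device_count(verbose=True) > 0:
        return 'gpu'
    return 'cpu'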
def test_vector_add():
    # Check that pycuda is installed and a CUDA capable device is present; if not, skip the test
    try:
        import pycuda.driver as drv
        drv.init()
    except (ImportError, Exception):
        pytest.skip("PyCuda not installed or no CUDA device detected")

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000
    problem_size = (size, 1)

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]
    params = {"block_size_x": 512}

    answer = run_kernel("vector_add", kernel_string, problem_size, args, params)

    assert numpy.allclose(answer[0], a + b, atol=1e-8)
def _init_gpu(self):
    """ Initialize gpu context """
    self.logger.info("starting cuda")
    cuda.init()
    dev = cuda.Device(self.gpu_id)
    self.ctx = dev.make_context()
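# Hedged companion sketch (an assumption, not part of the original class): a
# context created with make_context() stays current until it is popped, so a
# matching cleanup method is usually needed alongside _init_gpu above.
def _free_gpu(self):
    """ Release the gpu context created in _init_gpu. """
    if getattr(self, 'ctx', None) is not None:
        self.ctx.pop()
        self.ctx = None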
def get_num_gpus():
    """Returns the number of GPUs available"""
    print("Determining number of GPUs...")
    from pycuda import driver
    driver.init()
    num_gpus = driver.Device.count()
    print("Number of GPUs: {}".format(num_gpus))
    return num_gpus
def tensorrt_init(self, *args, **kwargs):
    from tensorrt.lite import Engine
    import pycuda.driver as cuda

    cuda.init()
    args[1].cuda_context = cuda.Device(0).make_context()
    args[0].logger.info('Loading TensorRT engine: %s' % self.engine_file)
    args[1].trt_engine = Engine(PLAN=self.engine_file)
    cuda.Context.pop()
def init():
    # MAGIC MAGIC
    import pycuda.driver as cuda
    cuda.init()
    from pycuda.tools import make_default_context
    context = make_default_context()
    device = context.get_device()
    import atexit
    atexit.register(context.detach)
def __init__(s, ndim, ns, ds, dt, h5save=False, **kwargs): s.dt = dt s.ndim = ndim if s.ndim == 1: s.nx = s.ns = ns s.dx = s.ds = ds elif s.ndim == 2: s.nx, s.ny = s.ns = ns if ds == list: s.dx, s.dy = s.ds = ds else: s.dx = s.dy = s.ds = ds s.h5save = h5save # sub area s.snx = 1024 s.sx0 = s.nx / 2 - s.snx / 2 s.sx1 = s.nx / 2 + s.snx / 2 if ndim == 2: s.sny = 1024 s.sy0 = s.ny / 2 - s.sny / 2 s.sy1 = s.ny / 2 + s.sny / 2 # dk s.dkx = 1.0 / (s.nx * s.dx) * 2 * np.pi if ndim == 2: s.dky = 1.0 / (s.ny * s.dy) * 2 * np.pi # array allocations s.psi = np.zeros(s.ns, dtype=np.complex64) s.x = np.arange(s.nx, dtype=np.float32) * s.dx s.kx = np.fft.fftfreq(s.nx, s.dx) * 2 * np.pi s.lcx = np.zeros(s.nx, dtype=np.complex64) s.lcx_sqrt = np.zeros(s.nx, dtype=np.complex64) s.lcx[:] = np.exp(-0.5j * s.kx ** 2 * dt) s.lcx_sqrt[:] = np.sqrt(s.lcx) if ndim == 2: s.y = np.arange(s.ny, dtype=np.float32) * s.dy s.ky = np.fft.fftfreq(s.ny, s.dy) * 2 * np.pi s.lcy = np.zeros(s.ny, dtype=np.complex64) s.lcy_sqrt = np.zeros(s.ny, dtype=np.complex64) s.lcy[:] = np.exp(-0.5j * s.ky ** 2 * dt) s.lcy_sqrt[:] = np.sqrt(s.lcy) # cuda cuda.init() s.ctx = cuda.Device(0).make_context() s.strm = cuda.Stream() s.plan = Plan(s.ns, dtype=np.complex64, context=s.ctx, stream=s.strm) s.psi_gpu = gpuarray.to_gpu(s.psi) if s.ndim == 2: s.lcy_gpu = gpuarray.to_gpu(s.lcy) s.tpb = 256 s.bpg = 30 * 4 print "tpb = %d, bpg = %g" % (s.tpb, s.bpg)
def __init__(self, ne, p_degree, cfl=0.1, v=0.5, target_gpu=0):
    cuda.init()
    self.dev = cuda.Device(target_gpu)
    self.ctx = self.dev.make_context()
    import atexit
    atexit.register(self.ctx.pop)

    super(DGModalGpu, self).__init__(ne, p_degree, cfl, v)
def __init__(self):
    import pycuda.driver as cuda
    self.cuda = cuda
    cuda.init()
    self.ctxs = [cuda.Device(i).make_context()
                 for i in range(cuda.Device.count())]
    for ctx in self.ctxs:
        ctx.pop()
    self.arrays = {}
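# Hedged usage sketch (not part of the original class): the constructor above
# creates one context per device and pops each one, so no context is current
# afterwards. To run work on device i, the stored context has to be pushed and
# popped around the GPU calls; the callable fn is a placeholder.
def run_on_device(self, i, fn):
    ctx = self.ctxs[i]
    ctx.push()
    try:
        return fn()
    finally:
        ctx.pop()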
def init_all_devices(): global DEVICES, DEVICE_INFO if DEVICES is not None: return DEVICES log.info("CUDA initialization (this may take a few seconds)") driver.init() DEVICES = [] DEVICE_INFO = {} log("CUDA driver version=%s", driver.get_driver_version()) ngpus = driver.Device.count() if ngpus==0: log.info("CUDA %s / PyCUDA %s, no devices found", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT) return DEVICES da = driver.device_attribute cf = driver.ctx_flags for i in range(ngpus): device = None context = None devinfo = "gpu %i" % i try: device = driver.Device(i) devinfo = device_info(device) log(" + testing device %s: %s", i, devinfo) DEVICE_INFO[i] = devinfo host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY) if not host_mem: log.warn("skipping device %s (cannot map host memory)", devinfo) continue context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST) try: log(" created context=%s", context) log(" api version=%s", context.get_api_version()) free, total = driver.mem_get_info() log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024)) log(" multi-processors: %s, clock rate: %s", device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE)) log(" max block sizes: (%s, %s, %s)", device.get_attribute(da.MAX_BLOCK_DIM_X), device.get_attribute(da.MAX_BLOCK_DIM_Y), device.get_attribute(da.MAX_BLOCK_DIM_Z)) log(" max grid sizes: (%s, %s, %s)", device.get_attribute(da.MAX_GRID_DIM_X), device.get_attribute(da.MAX_GRID_DIM_Y), device.get_attribute(da.MAX_GRID_DIM_Z)) max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH) max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT) log(" maximum texture size: %sx%s", max_width, max_height) log(" max pitch: %s", device.get_attribute(da.MAX_PITCH)) SMmajor, SMminor = device.compute_capability() compute = (SMmajor<<4) + SMminor log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor) if i==0: #we print the list info "header" from inside the loop #so that the log output is bunched up together log.info("CUDA %s / PyCUDA %s, found %s device%s:", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT, ngpus, engs(ngpus)) DEVICES.append(i) log.info(" + %s (memory: %s%% free, compute: %s.%s)", device_info(device), 100*free/total, SMmajor, SMminor) finally: context.pop() except Exception as e: log.error("error on device %s: %s", devinfo, e) return DEVICES
def __init__(self, device=0):
    drv.init()
    self.device = drv.Device(device)
    self.context = self.device.make_context()
    self.memory_pool = pycuda.tools.DeviceMemoryPool()
    # init fft object
    self.fft = FFT(self)
def init(device=None): """Initializes CUDA global state. Chainer maintains CUDA context, CUBLAS context, random number generator and device memory pool for each GPU device and for each process (the main process or a process forked by :mod:`multiprocessing`) as global states. When called for the first time on the process, this function initializes these global states. .. warning:: This function also initializes PyCUDA and scikits.cuda. Since these packages do not support forking after initialization, do not call this function before forking the process. This function also registers :func:`shutdown` to :mod:`atexit` slot. It also initializes random number generator. User can set fixed seed with ``CHAINER_SEED`` environment variable. Args: device (``int`` or :class:`~pycuda.driver.Device` or ``None``): Device ID to initialize on. """ global _contexts, _cublas_handles, _generators, _pid, _pools if not available: global _import_error raise RuntimeError( 'CUDA environment is not correctly set up. ' + 'The original import error said: ' + str(_import_error)) pid = os.getpid() if _pid == pid: # already initialized return drv.init() if device is None: # use default device context = cutools.make_default_context() device = Context.get_device() else: device = Device(device) context = device.make_context() _contexts = {device: context} _generators = {} _pools = {} _cublas_handles = {} cumisc.init(mem_alloc) seed(os.environ.get('CHAINER_SEED')) _pid = pid # mark as initialized atexit.register(shutdown)
def init():
    # MAGIC MAGIC
    from pycuda import driver
    driver.init()
    from pycuda.tools import make_default_context
    context = make_default_context()
    device = context.get_device()
    import atexit
    atexit.register(context.detach)
    return context
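# Hedged usage sketch (not from the original source): init() above returns the
# context made current by make_default_context(), so a caller can scope its GPU
# work explicitly while the atexit hook still detaches the context at interpreter
# exit. The callable fn is a placeholder.
def run_with_context(fn):
    ctx = init()
    try:
        return fn()
    finally:
        ctx.pop()  # leave no context current once the work is done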
def get_num_gpus_core():
    cuda.init()
    num = 0
    while True:
        try:
            cuda.Device(num)
        except:
            break
        else:
            num += 1
    return num
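# Hedged note (not from the original source): the probing loop above works, but
# pycuda also exposes the count directly; a shorter equivalent under the same
# import is sketched here with a hypothetical name.
def get_num_gpus_core_direct():
    cuda.init()
    return cuda.Device.count()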
def list_devices():
    import pycuda.driver as cuda
    cuda.init()
    for i in range(cuda.Device.count()):
        dev = cuda.Device(i)
        attrs = dev.get_attributes()
        # note: total_memory() reports total device memory, not free memory
        print 'Device %d (%s): compute %d.%d, total mem %d, PCI %s' % (
            i, dev.name(),
            attrs[cuda.device_attribute.COMPUTE_CAPABILITY_MAJOR],
            attrs[cuda.device_attribute.COMPUTE_CAPABILITY_MINOR],
            dev.total_memory(), dev.pci_bus_id())
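# Hedged variant (an assumption, not the original helper): the same report can be
# produced without indexing the attribute dictionary, using compute_capability()
# and total_memory() directly on each Device.
def list_devices_brief():
    import pycuda.driver as cuda
    cuda.init()
    for i in range(cuda.Device.count()):
        dev = cuda.Device(i)
        major, minor = dev.compute_capability()
        print('Device %d (%s): compute %d.%d, total mem %d' %
              (i, dev.name(), major, minor, dev.total_memory()))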
def main(): figsaved = True if not figsaved: #1. parsing sys arguments import sys import subnets.layers.someconfigs as someconfigs try: device = sys.argv[1] except: print 'USAGE: python tsne.py [device]' print 'example: device=cuda0' raise #2.initialize devices if device.startswith('gpu'): backend = 'cudandarray' someconfigs.backend = 'cudandarray' else: backend = 'gpuarray' someconfigs.backend = 'gpuarray' gpuid = int(device[-1]) if backend == 'cudandarray': import pycuda.driver as drv drv.init() dev = drv.Device(gpuid) ctx = dev.make_context() import theano.sandbox.cuda theano.sandbox.cuda.use(device) # import pycuda.gpuarray as gpuarray # #import theano # import theano.misc.pycuda_init # import theano.misc.pycuda_utils else: import os if 'THEANO_FLAGS' in os.environ: raise ValueError('Use theanorc to set the theano config') os.environ['THEANO_FLAGS'] = 'device={0}'.format(device) import theano.gpuarray ctx = theano.gpuarray.type.get_context(None) # from pygpu import collectives #--- # 3. create save_path and info_matrix for loading samples import time date = '%d-%d' % (time.gmtime()[1], time.gmtime()[2]) import os pid = os.getpid() save_path = './fig-%s-%d/' % (date, pid) if not os.path.exists(save_path): os.makedirs(save_path) print 'create dir', save_path data = load_testset() / 255. base_path = '/scratch/mahe6562/gap/' #base_path = '/work/imj/gap/' #model_path = 'grans/lsun_reo/swp0.025-8-gpuarray-1397924_genclipped/' #model_path = 'gran-lsun-nccl/11-19-swp0.1-8-gpuarray-1526900/' model_path = 'dcgan-lsun-nccl/11-17-swp0.1-8-gpuarray-1255492/' model_path1 = 'combined-lsun-nccl/11-19-swp0.1-4-gpuarray-658562/' info_matrix = [[base_path + model_path, 8, 4, 0, 34], [base_path + model_path, 8, 4, 4, 34], [base_path + model_path1, 4, 2, 0, 34]] mname = 'DCGAN' # 4. load samples based on info_matrix samples_single = load_single(*info_matrix[0]) samples_gap = load_single(*info_matrix[1]) samples_gap_comb = load_single(*info_matrix[2]) # 5. tsne based on loaded samples alldata, fig = tsne(mname, data, samples_single, samples_gap, samples_gap_comb, verbose=True) np.save(save_path + '/all%s.npy' % mname, alldata) fig.savefig(save_path + '/t-SNE%s.pdf' % mname, format='pdf') alldata = np.load(save_path + '/all%s.npy' % mname) plot_separate(alldata, mname='DCGAN') else: import time date = '%d-%d' % (time.gmtime()[1], time.gmtime()[2]) save_path = './fig-11-26-51116/' #'./fig-%s/' % date mname = 'DCGAN' alldata = np.load(save_path + '/all%s.npy' % mname) plot_separate(alldata, mname='DCGAN')
def gen_backend(model=None, gpu=None, nrv=False, datapar=False, modelpar=False, flexpoint=False, rng_seed=None, numerr_handling=None, half=False, stochastic_round=0, device_id=None): """ Construct and return a backend instance of the appropriate type based on the arguments given. With no parameters, a single CPU core, float32 backend is returned. Arguments: model (neon.models.model.Model): The instantiated model upon which we will utilize this backend. gpu (string, optional): Attempt to utilize a CUDA capable GPU if installed in the system. Defaults to None which implies a CPU based backend. If 'cudanet', utilize a cuda-convnet2 based backed, which supports Kepler and Maxwell GPUs with single precision. If 'nervanagpu', attempt to utilize the NervanaGPU Maxwell backend with float16 and float32 support. nrv (bool, optional): If True, attempt to utilize the Nervana Engine for computation (must be installed on the system). Defaults to False which implies a CPU based backend. datapar (bool, optional): Set to True to ensure that data is partitioned and each chunk is processed in parallel on different compute cores. Requires mpi4py. Defaults to False which implies that all data will be processed sequentially on a single compute core. modelpar (bool, optional): Set to True to ensure that the nodes in each model layer are partitioned and distributed across multiple compute cores. Requires mpi4py. Defaults to False which implies that all nodes in all model layers will be processed by the same single compute core. flexpoint (bool, optional): If True, attempt to use FlexPoint(TM) element typed data instead of the default float32 which is in place if set to False. rng_seed (numeric, optional): Set this to a numeric value which can be used to seed the random number generator of the instantiated backend. Defaults to None, which doesn't explicitly seed (so each run will be different) stochastic_round (numeric, optional): Only affects the max backend. If 1, perform stochastic rounding. If 0, round to nearest. numerr_handling (dict, optional): Dictate how numeric errors are displayed and handled. The keys and values permissible for this dict match that seen in numpy.seterr. If set to None (the default), behavior is equivalent to {'all': 'warn'} device_id (numeric, optional): Set this to a numeric value which can be used to select which device to run the process on Returns: Backend: newly constructed backend instance of the specifed type. Notes: * Attempts to construct a GPU instance without a CUDA capable card or without cudanet or nervanagpu package installed will cause the program to display an error message and exit. * Attempts to construct a parallel instance without mpi4py installed will cause the program to display an error message and exit. * The returned backend will still need to call its par.init_model() at some point after the model has been linked, in order for parallel training to proceed. """ logger = logging.getLogger(__name__) gpuflag = False if datapar and modelpar: raise NotImplementedError('Hybrid parallelization scheme not ' 'implemented yet. 
Try with at most one of' 'datapar or modelpar') if modelpar: par = ModelPar() elif datapar: par = DataPar() else: par = NoPar() if par.device_id is not None: if device_id is not None: logger.warn('Ignoring device id specified in command line.') device_id = par.device_id if gpu is not None: gpu = gpu.lower() if sys.platform.startswith("linux"): gpuflag = (os.system("nvcc --version > /dev/null 2>&1") == 0) elif sys.platform.startswith("darwin"): gpuflag = ( os.system("kextstat | grep -i cuda > /dev/null 2>&1") == 0) if gpuflag and gpu == 'cudanet': try: import cudanet # noqa from neon.backends.cc2 import GPU be_name = 'Cudanet' be = GPU(rng_seed=rng_seed, device_id=device_id) except ImportError: logger.warning("cudanet not found, can't run via GPU") gpuflag = False elif gpuflag and gpu == 'nervanagpu': try: import nervanagpu # noqa try: # import pycuda.autoinit import pycuda.driver as drv drv.init() device_id = device_id if device_id is not None else 0 global ctx ctx = drv.Device(device_id).make_context() import atexit atexit.register(ctx.pop) from neon.backends.gpu import GPU be_name = 'NervanaGPU' be = GPU(rng_seed=rng_seed, stochastic_round=stochastic_round, device_id=device_id) except ImportError: logger.warning("pycuda error, can't run via GPU") gpuflag = False except ImportError: logger.warning("nervanagpu not found, can't run via GPU") gpuflag = False if gpuflag is False: raise RuntimeError("Can't find CUDA capable GPU") elif nrv: nrv = False try: from umd.nrv_backend import NRVBackend nrv = True except ImportError: logger.warning("Nervana Engine system software not found") if flexpoint: logger.warning("Flexpoint(TM) backend not currently available") if nrv: be_name = 'NRV' be = NRVBackend(rng_seed=rng_seed, seterr_handling=numerr_handling, device_id=device_id) elif not gpuflag: be_name = 'CPU' be = CPU(rng_seed=rng_seed, seterr_handling=numerr_handling) logger.info("{} backend, RNG seed: {}, numerr: {}".format( be_name, rng_seed, numerr_handling)) par.associate(be) return be
def __init__(self):
    self.devices, self.device = [], None
    cu_driver.init()
    for gpu_i in range(cu_driver.Device.count()):
        self.devices.append(cu_driver.Device(gpu_i))
def _diffusion_child(comm, bm=None): rank = comm.Get_rank() ngpus = comm.Get_size() nodename = socket.gethostname() name = '%s %s' %(nodename, rank) print(name) if rank == 0: # reduce blocksize bm.data = np.copy(bm.data[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x], order='C') bm.labelData = np.copy(bm.labelData[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x], order='C') # domain decomposition sizeofblocks = (bm.argmax_z - bm.argmin_z) // ngpus blocks = [0] for k in range(ngpus-1): block_temp = blocks[-1] + sizeofblocks blocks.append(block_temp) blocks.append(bm.argmax_z - bm.argmin_z) print('blocks =', blocks) # read labeled slices if bm.label.allaxis: tmp = np.swapaxes(bm.labelData, 0, 1) tmp = np.ascontiguousarray(tmp) indices_01, _ = read_labeled_slices_allx(tmp) tmp = np.swapaxes(tmp, 0, 2) tmp = np.ascontiguousarray(tmp) indices_02, _ = read_labeled_slices_allx(tmp) # send data to childs for destination in range(ngpus-1,-1,-1): # ghost blocks blockmin = blocks[destination] blockmax = blocks[destination+1] datablockmin = blockmin - 100 datablockmax = blockmax + 100 datablockmin = 0 if datablockmin < 0 else datablockmin datablockmax = (bm.argmax_z - bm.argmin_z) if datablockmax > (bm.argmax_z - bm.argmin_z) else datablockmax datablock = np.copy(bm.data[datablockmin:datablockmax], order='C') labelblock = np.copy(bm.labelData[datablockmin:datablockmax], order='C') # read labeled slices if bm.label.allaxis: labelblock = labelblock.astype(np.int32) labelblock[:blockmin - datablockmin] = -1 labelblock[blockmax - datablockmin:] = -1 indices_child, labels_child = [], [] indices_00, labels_00 = read_labeled_slices_allx(labelblock) indices_child.append(indices_00) labels_child.append(labels_00) tmp = np.swapaxes(labelblock, 0, 1) tmp = np.ascontiguousarray(tmp) labels_01 = np.zeros((0, tmp.shape[1], tmp.shape[2]), dtype=np.int32) for slcIndex in indices_01: labels_01 = np.append(labels_01, [tmp[slcIndex]], axis=0) indices_child.append(indices_01) labels_child.append(labels_01) tmp = np.swapaxes(tmp, 0, 2) tmp = np.ascontiguousarray(tmp) labels_02 = np.zeros((0, tmp.shape[1], tmp.shape[2]), dtype=np.int32) for slcIndex in indices_02: labels_02 = np.append(labels_02, [tmp[slcIndex]], axis=0) indices_child.append(indices_02) labels_child.append(labels_02) else: labelblock[:blockmin - datablockmin] = 0 labelblock[blockmax - datablockmin:] = 0 indices_child, labels_child = read_labeled_slices(labelblock) # print indices of labels print('indices child %s:' %(destination), indices_child) if destination > 0: blocks_temp = blocks[:] blocks_temp[destination] = blockmin - datablockmin blocks_temp[destination+1] = blockmax - datablockmin dataListe = splitlargedata(datablock) sendToChild(comm, indices_child, destination, dataListe, labels_child, bm.label.nbrw, bm.label.sorw, blocks_temp, bm.label.allaxis, bm.allLabels, bm.label.smooth, bm.label.uncertainty, bm.platform) else: # select platform if bm.platform == 'cuda': import pycuda.driver as cuda cuda.init() dev = cuda.Device(rank) ctx, queue = dev.make_context(), None if bm.label.allaxis: from biomedisa_features.random_walk.pycuda_large_allx import walk else: from biomedisa_features.random_walk.pycuda_large import walk else: ctx, queue = _get_device(bm.platform, rank) from biomedisa_features.random_walk.pyopencl_large import walk # run random walks tic = time.time() memory_error, final, final_uncertainty, final_smooth = walk(comm, datablock, labels_child, indices_child, bm.label.nbrw, bm.label.sorw, 
blockmin-datablockmin, blockmax-datablockmin, name, bm.allLabels, bm.label.smooth, bm.label.uncertainty, ctx, queue) tac = time.time() print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds') # free device if bm.platform == 'cuda': ctx.pop() del ctx if memory_error: print('GPU out of memory. Image too large.') else: # gather data for source in range(1, ngpus): lendataListe = comm.recv(source=source, tag=0) for l in range(lendataListe): data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l)) receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8) comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1)) final = np.append(final, receivedata, axis=0) # save finals final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final final2 = final2[1:-1, 1:-1, 1:-1] save_data(bm.path_to_final, final2, bm.header, bm.final_image_type, bm.label.compression) # uncertainty if final_uncertainty is not None: final_uncertainty *= 255 final_uncertainty = final_uncertainty.astype(np.uint8) for source in range(1, ngpus): lendataListe = comm.recv(source=source, tag=0) for l in range(lendataListe): data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l)) receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8) comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1)) final_uncertainty = np.append(final_uncertainty, receivedata, axis=0) # save finals final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_uncertainty final2 = final2[1:-1, 1:-1, 1:-1] save_data(bm.path_to_uq, final2, compress=bm.label.compression) # smooth if final_smooth is not None: for source in range(1, ngpus): lendataListe = comm.recv(source=source, tag=0) for l in range(lendataListe): data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l)) receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8) comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1)) final_smooth = np.append(final_smooth, receivedata, axis=0) # save finals final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_smooth final2 = final2[1:-1, 1:-1, 1:-1] save_data(bm.path_to_smooth, final2, bm.header, bm.final_image_type, bm.label.compression) # print computation time t = int(time.time() - bm.TIC) if t < 60: time_str = str(t) + ' sec' elif 60 <= t < 3600: time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec' elif 3600 < t: time_str = str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min ' + str(t % 60) + ' sec' print('Computation time:', time_str) else: lendataListe = comm.recv(source=0, tag=0) for k in range(lendataListe): data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=10+(2*k)) if k==0: data = np.zeros((0, data_y, data_x), dtype=data_dtype) data_temp = np.empty((data_z, data_y, data_x), dtype=data_dtype) if data_dtype == 'uint8': comm.Recv([data_temp, MPI.BYTE], source=0, tag=10+(2*k+1)) else: comm.Recv([data_temp, MPI.FLOAT], source=0, tag=10+(2*k+1)) data = np.append(data, data_temp, axis=0) nbrw, sorw, allx, smooth, uncertainty, platform = comm.recv(source=0, tag=1) if allx: labels = [] for k in range(3): lenlabelsListe = comm.recv(source=0, tag=2+k) for l in range(lenlabelsListe): labels_z, labels_y, labels_x = comm.recv(source=0, tag=100+(2*k)) if l==0: labels_tmp = np.zeros((0, labels_y, 
labels_x), dtype=np.int32) tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([tmp, MPI.INT], source=0, tag=100+(2*k+1)) labels_tmp = np.append(labels_tmp, tmp, axis=0) labels.append(labels_tmp) else: lenlabelsListe = comm.recv(source=0, tag=2) for k in range(lenlabelsListe): labels_z, labels_y, labels_x = comm.recv(source=0, tag=100+(2*k)) if k==0: labels = np.zeros((0, labels_y, labels_x), dtype=np.int32) tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([tmp, MPI.INT], source=0, tag=100+(2*k+1)) labels = np.append(labels, tmp, axis=0) allLabels = comm.recv(source=0, tag=99) indices = comm.recv(source=0, tag=8) blocks = comm.recv(source=0, tag=9) blockmin = blocks[rank] blockmax = blocks[rank+1] # select platform if platform == 'cuda': import pycuda.driver as cuda cuda.init() dev = cuda.Device(rank) ctx, queue = dev.make_context(), None if allx: from biomedisa_features.random_walk.pycuda_large_allx import walk else: from biomedisa_features.random_walk.pycuda_large import walk else: ctx, queue = _get_device(platform, rank) from biomedisa_features.random_walk.pyopencl_large import walk # run random walks tic = time.time() memory_error, final, final_uncertainty, final_smooth = walk(comm, data, labels, indices, nbrw, sorw, blockmin, blockmax, name, allLabels, smooth, uncertainty, ctx, queue) tac = time.time() print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds') # free device if platform == 'cuda': ctx.pop() del ctx # send finals if not memory_error: dataListe = splitlargedata(final) comm.send(len(dataListe), dest=0, tag=0) for k, dataTemp in enumerate(dataListe): dataTemp = dataTemp.copy(order='C') comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k)) comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1)) if final_uncertainty is not None: final_uncertainty *= 255 final_uncertainty = final_uncertainty.astype(np.uint8) dataListe = splitlargedata(final_uncertainty) comm.send(len(dataListe), dest=0, tag=0) for k, dataTemp in enumerate(dataListe): dataTemp = dataTemp.copy(order='C') comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k)) comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1)) if final_smooth is not None: dataListe = splitlargedata(final_smooth) comm.send(len(dataListe), dest=0, tag=0) for k, dataTemp in enumerate(dataListe): dataTemp = dataTemp.copy(order='C') comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k)) comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1))
def gpu_init(device_no, pick_up, pickup_dirs): cuda.init() dev = cuda.Device(device_no) # the number of GPU ctx = dev.make_context() kwargs = {"h": h, "hsnham": hsnham, "VISCOINDX" : VISCOINDX, "H_xomoi": H_moi, "bienQ" : bienQ,\ "moci" : moci ,"mocj" : mocj, "dauj": dauj, "daui" : daui, "cuoii" : cuoii, "cuoij" : cuoij,\ "Tsxw" : Tsxw, "Tsyw" : Tsyw, "khouot" : khouot, "boundary_type" : boundary_type,\ "u": u, "v": v, "z" : z, "t_u": t_u, "t_v": t_v, "t_z": t_z, "Htdu": Htdu, "Htdv" : Htdv, \ "Kx1" : Kx1, "Ky1" : Ky1, "htaiz" : htaiz, "htaiz_bd" : htaiz,\ "bc_up": bc_up, "bc_down": bc_down, "bc_left": bc_left, "bc_right" : bc_right,\ "ubt" : ubt, "ubp" : ubp, "vbt" : vbt, "vbd" : vbd, "hi": hi,\ "FS" : FS, 'tFS': tFS, 'CC_u' : CC_u, 'CC_d' : CC_d, 'CC_l' : CC_l, 'CC_r' : CC_r,\ "VTH": VTH, "Kx" : Kx, "Ky" : Ky, "Fw" : Fw, "Qbx" : Qbx, "Qby" : Qby, "dH" : dH} # create a pointer object that store address of pointers on device pointers = Pointers(ctx, dtype=np.float64, **kwargs) # pointers = Pointers(ctx,**kwargs) # hmax is used to calculate boundary condition, this will be recalculated later on # in pre_processing kernel hmax = np.max(h[2]) # print hmax # allocate memory on device pd = pointers.alloc_on_device_only(N, M) pc = pointers.alloc() # store pointers on a list to transfer it to gpu # hmax here are just dummie values, for address alignment # so that pointers of other arrays can be copied to the right place in memory global_attributes = [np.int32(M), np.int32(N), floattype(hmax), floattype(hmax), floattype(hmax), floattype(hmax),\ pc['bienQ'], pc['daui'], pc['dauj'], pc['cuoii'], pc['cuoij'], pc['moci'], pc['mocj'], pc['khouot'], pc['boundary_type'],\ pc['h'], pc['v'], pc['u'], pc['z'], pc['t_u'], pc['t_v'], pc['t_z'], pc['Htdu'], pc['Htdv'], pc['H_moi'], pc['htaiz'],\ pc['htaiz_bd'], pc['ubt'], pc['ubp'], pc['vbt'], pc['vbd'], \ pc['hsnham'], pc['VISCOINDX'], pc['Kx1'], pc['Ky1'], pc['Tsyw'], pc['Tsxw'],\ pc['bc_up'], pc['bc_down'], pc['bc_left'], pc['bc_right'], pc['hi'],\ pc['FS'], pc['tFS'], pc['CC_u'], pc['CC_d'], pc['CC_l'], pc['CC_r'],\ pc['VTH'], pc['Kx'], pc['Ky'], pc['Fw'], pc['Qbx'], pc['Qby'], pc['dH']] auxilary_arrays = [pd['a1'], pd['b1'], pd['c1'], pd['d1'], pd['a2'], pd['c2'], pd['d2'], \ pd['f1'], pd['f2'], pd['f3'], pd['f5'],\ pd['AA'], pd['BB'], pd['CC'], pd['DD'],\ pd['x'], pd['Ap'], pd['Bp'], pd['ep'], pd['SN'] ] # copy struct to gpu: struct that store attribute arrays arg_struct_ptr = cuda.mem_alloc( np.intp(0).nbytes * (len(global_attributes) - 6) + 8 + 4 * np.dtype(floattype).itemsize) # copy struct to gpu: struct that store supporting arrays (i.e. 
arrays that only exist on device and don't have corresponding arrays on host) arr_struct_ptr = cuda.mem_alloc(np.intp(0).nbytes * len(auxilary_arrays)) arg_struct = PointersStruct(global_attributes, arg_struct_ptr) arr_struct = PointersStruct(auxilary_arrays, arr_struct_ptr, structtype='ARR') pointers.toDevice(['h', 'hsnham', 'VISCOINDX', 'bienQ', 'Tsyw', 'Tsxw', 'boundary_type', 'bc_up', 'bc_down', 'bc_left', 'bc_right',\ 'u', 'v', 'z', 'CC_u', 'CC_d', 'CC_l', 'CC_r', 'Fw']) ctx.synchronize() supplement = open("support_funcs.cu").read() supmod = SourceModule(supplement, include_dirs=[os.getcwd()]) # get functions from cuda file init_Kernel = supmod.get_function("Onetime_init") Find_Calculation_limits_x = supmod.get_function( "Find_Calculation_limits_Horizontal") Find_Calculation_limits_y = supmod.get_function( "Find_Calculation_limits_Vertical") gpu_Htuongdoi = supmod.get_function("Htuongdoi") preprocess = supmod.get_function("preprocess_data") # declare block size and grid size block_2d = (min(32, M + 3), 1, 1) grid_2d = ((M + 3) // min(32, M + 3) + 1, N + 3, 1) # call intialize kernels init_Kernel(arg_struct_ptr, block=block_2d, grid=grid_2d) ctx.synchronize() if pick_up is True: load_intial_condition(dirs, pointers) Find_Calculation_limits_x(arg_struct_ptr, block=(32, 1, 1), grid=(1, N, 1)) Find_Calculation_limits_y(arg_struct_ptr, block=(32, 1, 1), grid=(1, M, 1)) gpu_Htuongdoi(arg_struct_ptr, block=block_2d, grid=grid_2d) ctx.synchronize() preprocess(arg_struct_ptr, block=(32, 1, 1), grid=(1, 1, 1)) return pointers, ctx, arg_struct_ptr, arr_struct_ptr, supmod
def initialise_cuda():
    global MAXGPU
    # use getattr so the first call does not fail before the flag has been set
    if not getattr(pycuda, 'isinitialised', False):
        drv.init()
        pycuda.isinitialised = True
        MAXGPU = drv.Device.count()
def fun_mlp(shared_args, private_args, this_queue, that_queue): ''' shared_args contains neural network parameters private_args contains parameters for process run on each gpu this_queue and that_queue are used for synchronization between processes. ''' learning_rate = shared_args['learning_rate'] n_epochs = shared_args['n_epochs'] dataset = shared_args['dataset'] batch_size = shared_args['batch_size'] L1_reg = shared_args['L1_reg'] L2_reg = shared_args['L2_reg'] n_hidden = shared_args['n_hidden'] #### # pycuda and zmq environment drv.init() dev = drv.Device(private_args['ind_gpu']) ctx = dev.make_context() sock = zmq.Context().socket(zmq.PAIR) if private_args['flag_client']: sock.connect('tcp://localhost:5000') else: sock.bind('tcp://*:5000') #### #### # import theano related import theano.sandbox.cuda theano.sandbox.cuda.use(private_args['gpu']) import theano import theano.tensor as T from logistic_sgd import load_data from mlp import MLP import theano.misc.pycuda_init import theano.misc.pycuda_utils #### datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels rng = np.random.RandomState(1234) classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10) cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) gparams = [T.grad(cost, param) for param in classifier.params] updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)] train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) #### # setting pycuda and # pass handles, only done once param_ga_list = [] # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu param_other_list = [] # a list of theano shared variables that are used to store values of theano shared variable from the other gpu param_ga_other_list = [] # a list of pycuda gpuarrays which point to theano shared variables in param_other_list h_list = [] # a list of pycuda IPC handles shape_list = [] # a list containing shapes of variables in param_ga_list dtype_list = [] # a list containing dtypes of variables in param_ga_list average_fun_list = [] # a list containing theano functions for averaging parameters for param in classifier.params: param_other = theano.shared(param.get_value()) param_ga = \ theano.misc.pycuda_utils.to_gpuarray(param.container.value) param_ga_other = \ theano.misc.pycuda_utils.to_gpuarray( param_other.container.value) h = drv.mem_get_ipc_handle(param_ga.ptr) average_fun = \ theano.function([], updates=[(param, (param + 
param_other) / 2.)]) param_other_list.append(param_other) param_ga_list.append(param_ga) param_ga_other_list.append(param_ga_other) h_list.append(h) shape_list.append(param_ga.shape) dtype_list.append(param_ga.dtype) average_fun_list.append(average_fun) # pass shape, dtype and handles sock.send_pyobj((shape_list, dtype_list, h_list)) shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj() param_ga_remote_list = [] # create gpuarray point to the other gpu use the passed information for shape_other, dtype_other, h_other in zip(shape_other_list, dtype_other_list, h_other_list): param_ga_remote = \ gpuarray.GPUArray(shape_other, dtype_other, gpudata=drv.IPCMemoryHandle(h_other)) param_ga_remote_list.append(param_ga_remote) #### ############### # TRAIN MODEL # ############### print '... training' this_queue.put('') that_queue.get() start_time = time.time() epoch = 0 while epoch < n_epochs: epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): if minibatch_index % 2 == private_args['mod']: train_model(minibatch_index) this_queue.put('') that_queue.get() # exchanging weights for param_ga, param_ga_other, param_ga_remote in \ zip(param_ga_list, param_ga_other_list, param_ga_remote_list): drv.memcpy_peer( param_ga_other.ptr, param_ga_remote.ptr, param_ga_remote.dtype.itemsize * param_ga_remote.size, ctx, ctx) ctx.synchronize() this_queue.put('') that_queue.get() for average_fun in average_fun_list: average_fun() if private_args['verbose']: validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = np.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) end_time = time.time() this_queue.put('') that_queue.get() if private_args['verbose']: print 'The code run for %d epochs, with %f epochs/sec' % ( epoch, 1. * epoch / (end_time - start_time)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
def beamsim_gpu(k: float = 1000.0, x0: float = 0.1e-3, y0: float = 1e-3, z0: float = 0.0, nx: int = 1, ny: int = 240, nz: int = 160, dx: float = 1.0, dy: float = 1.0e-3, dz: float = 1.0e-3, elements_vectorized=None): if elements_vectorized is None: elements_vectorized = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0] # note, that on a remote worker, there is never a saying where this thread wakes up # therefore i must be extra carefull to always initialize all the resources needed # initialize manually drv.init() # local_device_count = drv.Device.count() # print('found {} GPUs'.format(local_device_count)) # choose one of the GPUs at random gpu_to_take = random.choice(range(drv.Device.count())) # take device 0 for now gpu_context = drv.Device(gpu_to_take).make_context() gpu_context.push() # make the context active code_text = SourceModule(""" #include <stdio.h> #define pi2 2.0f*3.141592653589793f __global__ void BeamSimKernel ( const float *tx, unsigned int tx_length, float *out, float x0, float y0, float z0, unsigned int nx, unsigned int ny, unsigned int nz, float dx, float dy, float dz, float k ) { unsigned int offset=0; unsigned int ix,iy,iz=0; unsigned int itx = 0; // used as a transducer iterator float amplitude = 0.0; // float pressure; // no need for it anymore float distance,kd,pressure_re,pressure_im=0; // float directivity_cos; // directivity_cos not used in this version float pixel_x,pixel_y,pixel_z,dix,diy,diz =0; // di* - delta distances as optimisation // iz=0 for now in CUDA, use 2D calculation only // calculate iz,ix from thread built-in variables ix = blockIdx.x * blockDim.x + threadIdx.x; // use cuda.x-grid as world.x iy = blockIdx.y * blockDim.y + threadIdx.y; // use cuda.y-grid as world.y iz = blockIdx.z * blockDim.z + threadIdx.z; // use cuda.z-grid as world.z // make sure that this thread won't try to calculate non-existent receiver if (iy >= ny) return; if (ix >= nx) return; if (iz >= nz) return; // start actual calculation : zero the accumulators pressure_re = 0; pressure_im = 0; // where am I in space? pixel_x = (float)ix * dx + x0; pixel_y = (float)iy * dy + y0; pixel_z = (float)iz * dz + z0; // debugging code only: // printf("block %d.%d: pixel %d.%d.%d, at %0.3f|%0.3f|%0.3f\\n",blockIdx.y,blockIdx.z, ix, iy, iz, pixel_x,pixel_y,pixel_z); // note that enabling this makes this a long call, time outs the driver // for each transmitter-element, do this: for (itx=0; itx<tx_length*6; itx=itx+6) // this hopefully acesses the same memory location for each thread and therefore will be cached { // calculate distance in cartesian space: dix = (pixel_x-tx[0+itx]); // tx.x diy = (pixel_y-tx[1+itx]); // tx.y diz = (pixel_z-tx[2+itx]); // tx.z distance = sqrtf( dix*dix + diy*diy + diz*diz ); // amplitude decays with distance as the energy gets distributed on a ring around the transmit point // note that ring is for 2D space, and a sphere surface would be more appropriate for 3D space amplitude = tx[3+itx] / ( pi2 * distance ); // amplitude is at itx+3 kd = -k * distance + tx[4 + itx]; // phase is at itx+4 // accumulate the energy pressure_re = pressure_re + __cosf(kd) * amplitude; pressure_im = pressure_im + __sinf(kd) * amplitude; } // write the result: // in case if I want the absolute pressure only (discards the phase) // pressure=sqrtf(pressure_re*pressure_re+pressure_im*pressure_im); // in CUDA, i need to calculate rx array memory offset manually for each thread: // offset=ix+iy*nx+iz*nx*ny; // that's a version for xyz, real-only numbers (e.g. 
pycuda.float32 version // out[offset]=(float)pressure; // this is a version for complex numbers: pycuda.complex64 offset=2*(iz+iy*nz+ix*nz*ny); // that's a version for xyz version out[offset]=(float)pressure_re; offset++; // go to the imaginary value pointer out[offset]=(float)pressure_im; } """) # instantiate the code into the compiler beamsim_kernel = code_text.get_function("BeamSimKernel") # convert the values from pythonic to Cudific ck = numpy.float32(k) cx0 = numpy.float32(x0) cy0 = numpy.float32(y0) cz0 = numpy.float32(z0) cnx = numpy.int32(nx) cny = numpy.int32(ny) cnz = numpy.int32(nz) cdx = numpy.float32(dx) cdy = numpy.float32(dy) cdz = numpy.float32(dz) ctx = numpy.asarray(elements_vectorized).astype(numpy.float32) ctx_count = numpy.int32(len(ctx) / 6) # note: must reserve the output memory right here # note: for 2D values, the x must be == 1 assert(nx == 1), 'this version supports nx=1 only' # prevent from the image size being too large assert(ny < 4096+1), ' ny too large: reduce calculated field pixel count' assert(nz < 4096+1), ' nz too large: reduce calculated field pixel count' # prevent from the transducer description to be too large - you can remove this limitation later on assert(ctx_count < 81920+1),'too many radiator points. Reduce simulation complexity' cuda_out = numpy.zeros((int(cny), int(cnz))).astype(numpy.complex64) # prepare the GPU call : thread wave shape: threads_x = 1 threads_y = 16 threads_z = 64 blocks_x = 1 blocks_y = int((int(cny) / threads_y) + 1) blocks_z = int((int(cnz) / threads_z) + 1) # start the timer! # time_1 = time.clock() beamsim_kernel( drv.In(ctx), ctx_count, drv.Out(cuda_out), cx0, cy0, cz0, cnx, cny, cnz, cdx, cdy, cdz, ck, block=(threads_x, threads_y, threads_z), grid=(blocks_x, blocks_y, blocks_z)) # time_2 = time.clock() # release the GPU from this thread # release the context, otherwise memory leak might occur gpu_context.pop() gpu_context.detach() # performance = numpy.float64(numpy.int128(cnx)*numpy.int128(cny)*numpy.int128(cnz)*numpy.int128(ctx_count)) / (time_2 - time_1) return cuda_out
def cuebeamlambert( elements_vectorized=None, k: float = 1000.0, lambert_radius: float = 100e-3, lambert_map_density: float = 100): # print("lambert_radius = {}, lambert_map_density={}".format(lambert_radius,lambert_map_density)) # matlab:: [img_lambert lambert_x lambert_y lambert_z] = cueBeam.cueBeam_lambert(tx',enviroment.wavenumber,lambert_radius,lambert_map_density); t0 = time.time() if elements_vectorized is None: elements_vectorized = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0] # note, that on a remote worker, there is never a saying where this thread wakes up # therefore i must be extra carefull to always initialize all the resources needed drv_init_time_before = time.time()-t0 # initialize manually drv.init() # local_device_count = drv.Device.count() # print('found {} GPUs'.format(local_device_count)) # choose one of the GPUs at random gpu_to_take = random.choice(range(drv.Device.count())) gpu_context = drv.Device(gpu_to_take).make_context() gpu_context.push() # make the context active drv_init_time_after = time.time() - t0 code_text_lambert=SourceModule(""" #include <stdio.h> #define pi2 2.0f*3.141592653589793f __global__ void BeamsimLambertKernel ( float *tx, unsigned int tx_length, float *out, unsigned int n, float d, float r, float k,float *xp, float *yp, float *zp) { unsigned int offset=0; unsigned int ix,iy,itx=0; float pressure,distance,kd,pressure_re,pressure_im=0; float dist2=0; float dix,diy,diz,lambert_x,lambert_y,lambert_z=0; float xbase,ybase,rho2,rhoi,cosphi,sinphi,cosl,sinl=0; float xbase0=-sqrtf((float)2)+(float)1e-8; // calculate ix,iy from thread built-in variables ix = blockIdx.x * blockDim.x + threadIdx.x; iy = blockIdx.y * blockDim.y + threadIdx.y; //ix=0; // debug // //C// for (iy=0; iy<ny; iy++) //C// for (ix=0; ix<nx; ix++) // make sure that this thread won't try to calculate non-existent receiver if (iy>n) return; if (ix>n) return; // start actual calculation pressure_re=0; pressure_im=0; xbase=(float)ix*d+xbase0; ybase=(float)iy*d+xbase0; // it would be an optimisation not to recalculate it for each pixel, this has to stay here be due to future port to CUDA where each pixel has it's own thread rho2=xbase*xbase+ybase*ybase; offset=ix+n*iy; if (rho2>(float)2) { out[2*offset]=0; out[2*offset+1]=0; // note, complex valued output xp[offset]=0; yp[offset]=0; zp[offset]=0; return; } rhoi=rsqrtf(rho2); cosl=-ybase*rhoi; cosphi=sqrtf(rho2-rho2*rho2/(float)4); lambert_x=r*cosl*cosphi; sinl=xbase*rhoi; lambert_y=r*sinl*cosphi; sinphi=(float)1-rho2/(float)2; lambert_z=r*sinphi; xp[offset]=lambert_x; yp[offset]=lambert_y; zp[offset]=lambert_z; for (itx=0; itx<tx_length*6; itx=itx+6) // this hopefully acesses the same memory location for each thread and therefore will be cached { // distance=single(sqrt( (ix*dx+x0-tx(1,itx)).^2 + (iy*dy+y0-tx(2,itx)).^2 + (iz*dz+z0-tx(3,itx)).^2 )); dix=(lambert_x-tx[0+itx]); diy=(lambert_y-tx[1+itx]); diz=(lambert_z-tx[2+itx]); distance=sqrtf( dix*dix + diy*diy + diz*diz ); kd=-k*distance+tx[5+itx]; dist2=tx[4+itx]/(6.283185307179586f*distance); //equals 2*pi pressure_re=pressure_re+__cosf(kd)*dist2; pressure_im=pressure_im+__sinf(kd)*dist2; // note: __sinf is an simlpified sin function that yields less accurate result. May need to switch to full sin for final product, ok for testing for now // note 2: function sincosf(...) may be faster in this case - calculates both sin and cos. but since it requires additional accumulators, // a detailed test will be required to find out what's faster. 
} // mem write // pressure=sqrtf(pressure_re*pressure_re+pressure_im*pressure_im); // in CUDA, i need to calculate rx array memory offset manually for each thread: //offset=ix+nx*iy+(ny*nx)*iz; // out[offset]=(float)pressure; //left for debug // offset=2*(ix+n*iy); // note, complex-valued version A out[2*offset+0]=(float)pressure_re; // use real-to-complex offset conversion offset++; // go to the imaginary value pointer out[2*offset+1]=(float)pressure_im; } """) # instantiate the code into the compiler beamsim_lambert_kernel = code_text_lambert.get_function("BeamsimLambertKernel") kernel_init_time = time.time() - t0 # calc basic space properties npts = float(math.ceil(6.283185307179586 * lambert_radius / lambert_map_density)) d = 2.0 * math.sqrt(2)/npts n = 1 + math.ceil(2*math.sqrt(2)/d) # convert the values from pythonic to Cudific ctx = numpy.asarray(elements_vectorized).astype(numpy.float32) ctx_count = numpy.int32(len(ctx) / 6) cn = numpy.int32(n) cd = numpy.float32(d) cr = numpy.float32(lambert_radius) ck = numpy.float32(k) cuda_out_xp = numpy.zeros((int(n), int(n))).astype(numpy.float32) cuda_out_yp = numpy.zeros((int(n), int(n))).astype(numpy.float32) cuda_out_zp = numpy.zeros((int(n), int(n))).astype(numpy.float32) cuda_out = numpy.zeros((int(n), int(n))).astype(numpy.complex64) # note: must reserve the output memory right here # prevent from the transducer description to be too large - you can remove this limitation later on assert (ctx_count < 300001), "transducer definition too large" # prepare the GPU call : thread wave shape: threads_x = 16 threads_y = 64 threads_z = 1 blocks_x = int((int(n) / threads_x) + 1) blocks_y = int((int(n) / threads_y) + 1) blocks_z = 1 kernel_prepare_time = time.time() - t0 beamsim_lambert_kernel( drv.In(ctx), ctx_count, drv.Out(cuda_out), cn, cd, cr, ck, drv.Out(cuda_out_xp), drv.Out(cuda_out_yp), drv.Out(cuda_out_zp), block=(threads_x, threads_y, threads_z), grid=(blocks_x, blocks_y, blocks_z)) kernel_run_time=time.time() - t0 # release the GPU from this thread # release the context, otherwise memory leak might occur gpu_context.pop() gpu_context.detach() detach_time = time.time() - t0 # calculate performance metrics. This is usefull for debugging the distributed computation system dinittime = drv_init_time_after - drv_init_time_before dkernel_init_time = kernel_init_time - drv_init_time_after dkernel_prepare_time = kernel_prepare_time - kernel_init_time dkernel_run_time = kernel_run_time - kernel_prepare_time dkernel_detach_time = detach_time - kernel_run_time print("lambert: init:{:06.4f}, kernel_init:{:06.4f}, kernel_prepare:{:06.4f}, kernel_run:{:06.4f}, detach:{:06.4f}".format(dinittime,dkernel_init_time,dkernel_prepare_time,dkernel_run_time,dkernel_detach_time)) # finally... return cuda_out, cuda_out_xp, cuda_out_yp, cuda_out_zp
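# Sketch (my own, not from the original library) of the Lambert azimuthal equal-area
# mapping that BeamsimLambertKernel evaluates per thread, written in NumPy using an
# algebraically equivalent form of the kernel's cosl/sinl/cosphi expressions. It is a
# handy CPU check that the generated sample points really lie on a sphere of radius r.
import numpy

def lambert_points(r, d):
    xbase0 = -numpy.sqrt(2.0) + 1e-8
    n = 1 + int(numpy.ceil(2.0 * numpy.sqrt(2.0) / d))
    ix, iy = numpy.meshgrid(numpy.arange(n), numpy.arange(n), indexing='ij')
    x = ix * d + xbase0
    y = iy * d + xbase0
    rho2 = x * x + y * y
    inside = rho2 <= 2.0                      # points outside the disc are not mapped
    scale = numpy.sqrt(1.0 - rho2 / 4.0, where=inside, out=numpy.zeros_like(rho2))
    px = -r * y * scale
    py = r * x * scale
    pz = r * (1.0 - rho2 / 2.0)
    return px[inside], py[inside], pz[inside]

px, py, pz = lambert_points(r=100e-3, d=0.05)
radii = numpy.sqrt(px**2 + py**2 + pz**2)
print(radii.min(), radii.max())   # both ~0.1, i.e. every point sits on the sphere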
def image_iterator_gpu(image_volume, roi=None, radius=2, gray_levels=None, binwidth=None, dx=1, dy=0, dz=0, ndev=2, cadd=(0, 0, 0), sadd=3, csub=(0, 0, 0), ssub=3, i=0, fixed_start=-250, fixed_end=350, feature_kernel='kernel_glcm', stat_name='stat_glcm_contrast'): """Uses PyCuda to parallelize the computation of the voxel-wise image entropy using a variable \ neighborhood radius Args: radius -- neighborhood radius; where neighborhood size is isotropic and calculated as 2*radius+1 """ # initialize cuda context cuda.init() cudacontext = cuda.Device(NVDEVICE).make_context() parent_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(parent_dir, 'local_features.cuh'), mode='r') as f: cuda_template = Template(f.read()) roimask = None if isinstance(image_volume, np.ndarray): toBaseVolume = False logger.debug('recognized as an np.ndarray') if image_volume.ndim == 3: d, r, c = image_volume.shape elif image_volume.ndim == 2: d, r, c = (1, *image_volume.shape) image = image_volume.flatten() # # use stat based GLCM quantization # quantize_mode=QMODE_STAT else: toBaseVolume = True logger.debug('recognized as a BaseVolume') image = image_volume if roi: image = image.conformTo(roi.frameofreference) d, r, c = image.frameofreference.size[::-1] image = image.vectorize() # if not image_volume.modality.lower() == 'ct': # # use stat based GLCM quantization # quantize_mode=QMODE_STAT # mask to roi if (roi): roimask = roi.makeDenseMask().vectorize() logger.debug('d:{:d}, r:{:d}, c:{:d}'.format(d, r, c)) if d == 1: z_radius = 0 elif d > 1: z_radius = radius # enforce quantization mode selection if gray_levels and binwidth: logger.exception( 'must exclusively specify "binwidth" or "gray_levels" to select glcm quantization mode' ) elif binwidth: quantize_mode = QMODE_FIXEDHU nbins = int(math.floor((fixed_end - fixed_start) / binwidth)) + 2 logger.debug( 'quantization using {} fixed bins from {} to {} with spacing {}'. 
format(nbins, fixed_start, fixed_end, binwidth)) gray_levels = -1 elif gray_levels: warnings.warn( 'QMODE_STAT quantization mode will be deprecated soon in favor of other quantization methods.', DeprecationWarning) quantize_mode = QMODE_STAT nbins = gray_levels binwidth = -1 else: # kernel doesn't use glcm quantize_mode = -1 nbins = 1 gray_levels = -1 binwidth = -1 maxrunlength = math.ceil( math.sqrt(2 * (radius * 2 + 1) * (radius * 2 + 1) + (z_radius * 2 + 1))) cuda_source = cuda_template.substitute({ 'RADIUS': radius, 'Z_RADIUS': z_radius, 'IMAGE_DEPTH': d, 'IMAGE_HEIGHT': r, 'IMAGE_WIDTH': c, 'QUANTIZE_MODE': quantize_mode, 'GRAY_LEVELS': gray_levels, 'FIXED_BINWIDTH': binwidth, 'FIXED_START': fixed_start, 'NBINS': nbins, 'MAXRUNLENGTH': maxrunlength, 'DX': dx, 'DY': dy, 'DZ': dz, 'NDEV': ndev, 'CADD_X': cadd[0], 'CADD_Y': cadd[1], 'CADD_Z': cadd[2], 'SADD': sadd, 'CSUB_X': csub[0], 'CSUB_Y': csub[1], 'CSUB_Z': csub[2], 'SSUB': ssub, 'KERNEL': feature_kernel, 'STAT': stat_name }) mod2 = SourceModule( cuda_source, options=[ '-I {!s}'.format(parent_dir), # '-g', '-G', '-lineinfo' ]) func = mod2.get_function('image_iterator_gpu') # allocate image on device in global memory image = image.astype(np.float32) image_gpu = cuda.mem_alloc(image.nbytes) result = np.zeros_like(image) result_gpu = cuda.mem_alloc(result.nbytes) # transfer image to device cuda.memcpy_htod(image_gpu, image) cuda.memcpy_htod(result_gpu, result) # call device kernel blocksize = 256 gridsize = math.ceil(r * c * d / blocksize) func(image_gpu, result_gpu, block=(blocksize, 1, 1), grid=(gridsize, 1, 1)) # get result from device cuda.memcpy_dtoh(result, result_gpu) # detach from cuda context # cudacontext.synchronize() # cudacontext.detach() cudacontext.pop() # required to successfully free device memory for created context del cudacontext gc.collect() pycuda.tools.clear_context_caches() logger.debug('feature result shape: {!s}'.format(result.shape)) logger.debug('GPU done') # clean invalid values from result result = np.nan_to_num(result) if (roimask is not None): result = np.multiply(result, roimask) if d == 1: result = result.reshape(r, c) elif d > 1: result = result.reshape(d, r, c) if toBaseVolume: if roi: FOR = roi.frameofreference else: FOR = image_volume.frameofreference outvolume = MaskableVolume().fromArray(result, FOR) outvolume.modality = image_volume.modality return outvolume else: return result
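# Sketch of a defensive variant of the context handling used above (the try/finally is
# my addition, not part of the original module): if anything between make_context() and
# pop() raises, the context would otherwise stay attached to the thread and leak device
# memory. device_id stands in for the NVDEVICE constant used by the surrounding code.
import gc
import pycuda.tools
import pycuda.driver as cuda

def with_gpu_context(work, device_id=0):
    """Run work() inside a freshly created context and always clean up."""
    cuda.init()
    ctx = cuda.Device(device_id).make_context()
    try:
        return work()
    finally:
        ctx.pop()                          # remove context from this thread's stack
        del ctx                            # drop the last reference so memory can be freed
        gc.collect()
        pycuda.tools.clear_context_caches()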
def elementwise_composition_gpu(feature_volume_list, comp_type='elementwiseMean'): """computes the elementwise mean of the like-shaped volumes in feature_volume_list""" # initialize cuda context cuda.init() cudacontext = cuda.Device(NVDEVICE).make_context() parent_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(parent_dir, 'feature_compositions.cuh'), mode='r') as f: mod = SourceModule( f.read(), options=[ '-I {!s}'.format(parent_dir), # '-g', '-G', '-lineinfo' ]) func = mod.get_function(comp_type) # combine volumes into linearized array FOR = feature_volume_list[0].frameofreference vols = [] for vol in feature_volume_list: vols.append(vol.vectorize()) array_length = np.product(FOR.size).item() while len(vols) > 1: num_arrays = 2 cat = np.concatenate([vols.pop() for x in range(num_arrays)], axis=0) # allocate image on device in global memory cat = cat.astype(np.float32) cat_gpu = cuda.mem_alloc(cat.nbytes) result = np.zeros((array_length)).astype(np.float32) result_gpu = cuda.mem_alloc(result.nbytes) # transfer cat to device cuda.memcpy_htod(cat_gpu, cat) cuda.memcpy_htod(result_gpu, result) # call device kernel blocksize = 512 gridsize = math.ceil(array_length / blocksize) func(cat_gpu, result_gpu, np.int32(array_length), np.int32(num_arrays), block=(blocksize, 1, 1), grid=(gridsize, 1, 1)) # get result from device cuda.memcpy_dtoh(result, result_gpu) vols.append(result.reshape((-1, 1))) result = vols[0] # detach from cuda context # cudacontext.synchronize() # cudacontext.detach() cudacontext.pop() # required to successfully free device memory for created context del cudacontext gc.collect() pycuda.tools.clear_context_caches() x = MaskableVolume().fromArray(result, FOR) x.modality = feature_volume_list[0].modality return x
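# Minimal sketch (my own, not from the original module) of the same elementwise-mean
# composition done with pycuda.gpuarray arithmetic instead of a hand-written kernel.
# It assumes all volumes are already flat float32 arrays of equal length.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

def elementwise_mean(volumes):
    acc = gpuarray.to_gpu(volumes[0].astype(np.float32))
    for vol in volumes[1:]:
        acc = acc + gpuarray.to_gpu(vol.astype(np.float32))
    return (acc / np.float32(len(volumes))).get()

vols = [np.random.rand(1000).astype(np.float32) for _ in range(4)]
print(np.allclose(elementwise_mean(vols), np.mean(vols, axis=0), atol=1e-6))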
def init_cuda(self, X, Y, cls_start, max_kernels=1): #assert X.shape[0]==Y.shape[0] self.max_concurrent_kernels = max_kernels self.X = X self.Y = Y self.cls_start = cls_start.astype(np.int32) #handle to gpu memory for y for each concurrent classifier self.g_y = [] #handle to gpu memory for results for each concurrent classifier self.g_out = [] #gpu kernel out self.kernel_out = [] #cpu kernel out #blocks per grid for each concurrent classifier self.bpg = [] #function reference self.func = [] #texture references for each concurrent kernel self.tex_ref = [] #main vectors #gpu self.g_vecI = [] self.g_vecJ = [] #cpu self.main_vecI = [] self.main_vecJ = [] #cpu class self.cls_count = [] self.cls = [] #gpu class self.g_cls_count = [] self.g_cls = [] self.sum_cls = [] for i in range(max_kernels): self.bpg.append(0) self.g_y.append(0) self.g_out.append(0) self.kernel_out.append(0) self.cls_count.append(0) self.cls.append(0) self.g_cls_count.append(0) self.g_cls.append(0) # self.func.append(0) # self.tex_ref.append(0) self.g_vecI.append(0) self.g_vecJ.append(0) # self.main_vecI.append(0) # self.main_vecJ.append(0) self.sum_cls.append(0) self.N, self.Dim = X.shape column_size = self.N * 4 cacheMB = self.cache_size * 1024 * 1024 #100MB for cache size #how many kernel colums will be stored in cache cache_items = np.floor(cacheMB / column_size).astype(int) cache_items = min(self.N, cache_items) self.kernel_cache = pylru.lrucache(cache_items) self.compute_diag() #cuda initialization cuda.init() self.dev = cuda.Device(0) self.ctx = self.dev.make_context() #reade cuda .cu file with module code with open(self.module_file, "r") as CudaFile: module_code = CudaFile.read() #compile module self.module = SourceModule(module_code, keep=True, no_extern_c=True) (g_gamma, gsize) = self.module.get_global('GAMMA') cuda.memcpy_htod(g_gamma, np.float32(self.Gamma)) #get functions reference Dim = self.Dim vecBytes = Dim * 4 for f in range(self.max_concurrent_kernels): gfun = self.module.get_function(self.func_name) self.func.append(gfun) #init texture for vector I vecI_tex = self.module.get_texref('VecI_TexRef') self.g_vecI[f] = cuda.mem_alloc(vecBytes) vecI_tex.set_address(self.g_vecI[f], vecBytes) #init texture for vector J vecJ_tex = self.module.get_texref('VecJ_TexRef') self.g_vecJ[f] = cuda.mem_alloc(vecBytes) vecJ_tex.set_address(self.g_vecJ[f], vecBytes) self.tex_ref.append((vecI_tex, vecJ_tex)) self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32)) self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32)) texReflist = list(self.tex_ref[f]) #function definition P-pointer i-int gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist) #transform X to particular format v, c, r = spf.csr2ellpack(self.X, align=self.prefetch) #copy format data structure to gpu memory self.g_val = cuda.to_device(v) self.g_col = cuda.to_device(c) self.g_len = cuda.to_device(r) self.g_sdot = cuda.to_device(self.Xsquare) self.g_cls_start = cuda.to_device(self.cls_start)
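# Sketch of the get_global()/memcpy_htod pattern used above to set the GAMMA
# __constant__ symbol, shown in isolation. The kernel here is a stand-in, not the
# actual SVM kernel from the original class, and the texture setup is omitted.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__constant__ float GAMMA;
__global__ void scale(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= GAMMA;
}
""")
g_gamma, gsize = mod.get_global('GAMMA')     # device pointer and size in bytes
cuda.memcpy_htod(g_gamma, np.float32(0.5))   # write the constant before launching

x = np.arange(8, dtype=np.float32)
scale = mod.get_function('scale')
scale(cuda.InOut(x), np.int32(x.size), block=(32, 1, 1), grid=(1, 1, 1))
print(x)   # expected: 0.5 * [0, 1, ..., 7]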
"""
Module to handle the cuda runtime environment.
"""
from __future__ import print_function, division, absolute_import

# system level imports
import ctypes
import os
import math

# pycuda imports
import pycuda
import pycuda.driver as cudadrv

# Init cuda
cudadrv.init()

# package level imports
from ppmd.cuda import cuda_config
from ppmd import runtime, pio, mpi, abort

CUDA_ENABLED = cuda_config.CUDA_CFG['enable-cuda'][1]
OPT = cuda_config.CUDA_CFG['opt-level'][1]
DEBUG = cuda_config.CUDA_CFG['debug-level'][1]
VERBOSE = cuda_config.CUDA_CFG['verbose-level'][1]
TIMER = cuda_config.CUDA_CFG['timer-level'][1]
BUILD_TIMER = cuda_config.CUDA_CFG['build-timer-level'][1]
ERROR_LEVEL = cuda_config.CUDA_CFG['error-level'][1]
BUILD_DIR = runtime.BUILD_DIR

from . import cuda_build
def __init__(self):
    cuda.init()
    # Device.count() can never be negative; the intended check is "no devices found"
    if cuda.Device.count() < 1:
        raise ValueError("No GPU found on this device")
    drv.memcpy_htod(gpu_args[0], output_image)

    # launch the kernel
    context.synchronize()
    start.record()
    convolution(*gpu_args, block=threads, grid=grid, stream=None, shared=0)
    end.record()
    context.synchronize()
    print("convolution_kernel took", end.time_since(start), "ms.")

    # copy output data back from GPU
    drv.memcpy_dtoh(output_image, gpu_args[0])

    # compare output with reference
    correct = numpy.allclose(output_image, reference, atol=1e-6)
    if not correct:
        print("TEST FAILED!")
    else:
        print("TEST PASSED!")


if __name__ == "__main__":
    # init pycuda
    drv.init()
    context = drv.Device(0).make_context()
    try:
        convolution_example(context)
    finally:
        context.pop()
#!/usr/bin/env python3
# A simple class to know about your cuda devices
import pycuda.driver as cuda
import pycuda.autoinit  # Creates a context; needed for mem_get_info()

cuda.init()  # Redundant once pycuda.autoinit is imported, but harmless


class aboutCudaDevices():
    def __init__(self):
        pass

    def num_devices(self):
        """Return number of devices connected."""
        return cuda.Device.count()

    def devices(self):
        """Get info on all devices connected."""
        num = cuda.Device.count()
        print("%d device(s) found:" % num)
        for i in range(num):
            print(cuda.Device(i).name(), "(Id: %d)" % i)

    def mem_info(self):
        """Get available and total memory of the device behind the current context."""
        available, total = cuda.mem_get_info()
        print("Available: %.2f GB\nTotal: %.2f GB" % (available/1e9, total/1e9))

    def attributes(self, device_id=0):
        """Get attributes of the device with id = device_id."""
        return cuda.Device(device_id).get_attributes()
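# Possible usage of the helper class above (my own example; it assumes at least one
# CUDA device and relies on the context created by pycuda.autoinit for mem_info()):
if __name__ == "__main__":
    info = aboutCudaDevices()
    info.devices()      # list every device with its id
    info.mem_info()     # free/total memory of the device behind the current context
    for attr, value in sorted(info.attributes(0).items(), key=lambda kv: str(kv[0])):
        print(attr, value)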
# Authors: Paul Kienzle, Christopher Metting
# 03/23/2010

import time
import Queue
import threading

import numpy
import numpy.linalg as linalg

from . import approximations
from ..model.sample_prep import *

try:
    from pycuda import gpuarray
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    cuda.init()
    cudaFind = True
except Exception:
    print 'PyCUDA or CUDA not installed. Reverting to CPU calculation'
    cudaFind = False


def readfile(name):
    file = open(name)
    txt = file.read()
    file.close()
    return txt


def loadkernelsrc(name, precision='float32', defines={}):
    import os
def main(inputSize): """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference.""" global vs, outputFrame, lock, t0, t1, fps cuda.init() device = cuda.Device(0) onnx_file_path = 'yolov3-tiny-416.onnx' engine_file_path = 'yolov3-tiny-{}.trt'.format(inputSize) h, w = (inputSize, inputSize) # Two-dimensional tuple with the target network's (spatial) input resolution in HW ordered input_resolution_yolov3_HW = (inputSize, inputSize) # Create a pre-processor object by specifying the required input resolution for YOLOv3 preprocessor = PreprocessYOLO(input_resolution_yolov3_HW) # Output shapes expected by the post-processor output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26)] # Do inference with TensorRT cuda.init() # Initialize CUDA ctx = make_default_context() # Create CUDA context postprocessor_args = { "yolo_masks": [(3, 4, 5), (0, 1, 2)], "yolo_anchors": [(10, 14), (23, 27), (37, 58), (81, 82), (135, 169), (344, 319)], "obj_threshold": 0.4, "nms_threshold": 0.5, "yolo_input_resolution": input_resolution_yolov3_HW } postprocessor = PostprocessYOLO(**postprocessor_args) with get_engine(onnx_file_path, engine_file_path ) as engine, engine.create_execution_context() as context: print("performing inference") inputs, outputs, bindings, stream = common.allocate_buffers(engine) while True: trt_outputs = [] #image_raw=vs.read() ret, image_raw = cap.read() if image_raw is not None: image_raw, image = preprocessor.process(image_raw) shape_orig_WH = image_raw.size inputs[0].host = image t0 = time.time() trt_outputs = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) trt_outputs = [ output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes) ] boxes, classes, scores = postprocessor.process( trt_outputs, (shape_orig_WH), 0) t1 = time.time() t_inf = t1 - t0 fps = 1 / t_inf draw = True if (boxes is None): print("no bboxes") draw = False if (classes is None): print("no classes") draw = False if (scores is None): print("no scores") draw = False if draw: obj_detected_img = draw_bboxes( image_raw, bboxes=boxes, confidences=scores, categories=classes, all_categories=ALL_CATEGORIES) else: obj_detected_img = image_raw #now stream this image with lock: outputFrame = np.array(obj_detected_img) ctx.pop()
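# Sketch (my own) of the context bracket used in main() above:
# pycuda.tools.make_default_context() picks a device and creates a context; wrapping
# the inference loop in try/finally makes sure ctx.pop() runs even if the capture loop
# or the TensorRT call raises. A single cuda.init() is enough; the second call in
# main() is redundant.
import pycuda.driver as cuda
from pycuda.tools import make_default_context

def run_inference(loop_body):
    cuda.init()
    ctx = make_default_context()
    try:
        loop_body()
    finally:
        ctx.pop()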
def train_net(config, private_config): # UNPACK CONFIGS (flag_para_load, flag_datalayer, train_filenames, val_filenames, train_labels, val_labels, img_mean) = \ unpack_configs(config, ext_data=private_config['ext_data'], ext_label=private_config['ext_label']) gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. if private_config['flag_verbose']: print 'training error rate:', error_ij if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_datalayer, flag_para_load, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. if private_config['flag_verbose']: print('epoch %i: validation loss %f ' % (epoch, this_val_loss)) print('epoch %i: validation error %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'val_record.npy', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save']: if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def _test_pycuda(self): import pycuda.driver as drv import pycuda.tools import pycuda.autoinit import numpy import numpy.linalg as la from pycuda.compiler import SourceModule from jinja2 import Template import pycuda.gpuarray as gpuarray tpl2 = Template(""" extern __shared__ int smem[]; __global__ void square_them(int* a,int l) { const int i = blockDim.x*blockIdx.x +l*threadIdx.x; int* tmp=&smem[l*threadIdx.x]; for(int j=0;j<l;++j) tmp[j]=a[i+j]; __syncthreads(); for(int j=0;j<l;++j) tmp[j]*=2; __syncthreads(); for(int j=0;j<l;++j) a[i+j]=tmp[j]; } """) drv.init() dev = drv.Device(0) t = 9 nthr = 256 size_arrays = 1024 * 512 num_blocks = 64 l = int(size_arrays / num_blocks / nthr) print l a = range(0, l) a = a * (nthr * num_blocks) a = numpy.array(a, dtype=numpy.int32) a_gpu = gpuarray.to_gpu(a.astype(numpy.int32)) code = tpl2.render() #print code mod = SourceModule(code) square_them = mod.get_function("square_them") shmem = 4 * l * nthr print "sh mem usage: ", shmem if shmem > dev.MAX_SHARED_MEMORY_PER_BLOCK: print "too much shared memory used" #return print a[0:2 * l] square_them(a_gpu, numpy.int32(l), grid=(num_blocks, 1, 1), block=(nthr, 1, 1), shared=shmem) print a[0:2 * l]
def calculation(in_queue, out_queue): device_num, params = in_queue.get() chunk_size = params['chunk_size'] chunks_num = params['chunks_num'] particles = params['particles'] state = params['state'] representation = params['representation'] quantities = params['quantities'] decoherence = params['decoherence'] if decoherence is not None: decoherence_steps = decoherence['steps'] decoherence_coeff = decoherence['coeff'] else: decoherence_steps = 0 decoherence_coeff = 1 binning = params['binning'] if binning is not None: s = set() for names, _, _ in binning: s.update(names) quantities = sorted(list(s)) c_dtype = numpy.complex128 c_ctype = 'double2' s_dtype = numpy.float64 s_ctype = 'double' Fs = [] cuda.init() device = cuda.Device(device_num) ctx = device.make_context() free, total = cuda.mem_get_info() max_chunk_size = float(total) / len(quantities) / numpy.dtype( c_dtype).itemsize / 1.1 max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10)) #print free, total, max_chunk_size if max_chunk_size > chunk_size: subchunk_size = chunk_size subchunks_num = 1 else: assert chunk_size % max_chunk_size == 0 subchunk_size = max_chunk_size subchunks_num = chunk_size / subchunk_size buffers = [] for quantity in sorted(quantities): buffers.append(GPUArray(subchunk_size, c_dtype)) stream = cuda.Stream() # compile code try: source = TEMPLATE.render(c_ctype=c_ctype, s_ctype=s_ctype, particles=particles, state=state, representation=representation, quantities=quantities, decoherence_coeff=decoherence_coeff) except: print exceptions.text_error_template().render() raise try: module = SourceModule(source, no_extern_c=True) except: for i, l in enumerate(source.split("\n")): print i + 1, ":", l raise kernel_initialize = module.get_function("initialize") kernel_calculate = module.get_function("calculate") kernel_decoherence = module.get_function("decoherence") # prepare call parameters gen_block_size = min(kernel_initialize.max_threads_per_block, kernel_calculate.max_threads_per_block) gen_grid_size = device.get_attribute( cuda.device_attribute.MULTIPROCESSOR_COUNT) gen_block = (gen_block_size, 1, 1) gen_grid = (gen_grid_size, 1, 1) num_gen = gen_block_size * gen_grid_size assert num_gen <= 20000 # prepare RNG states #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32)) seeds = to_gpu( numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32)) state_type_size = sizeof("curandStateXORWOW", "#include <curand_kernel.h>") states = cuda.mem_alloc(num_gen * state_type_size) #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE) #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k kernel_initialize(states, seeds.gpudata, block=gen_block, grid=gen_grid, stream=stream) #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size) # run calculation args = [states] + [buf.gpudata for buf in buffers] + [numpy.int32(subchunk_size)] if binning is None: results = { quantity: numpy.zeros( (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype) for quantity in quantities } for i in xrange(chunks_num * subchunks_num): kernel_calculate(*args, block=gen_block, grid=gen_grid, stream=stream) for k in xrange(decoherence_steps + 1): if k > 0: kernel_decoherence(*args, block=gen_block, grid=gen_grid, stream=stream) for j, quantity in enumerate(sorted(quantities)): F = (gpuarray.sum(buffers[j], stream=stream) / buffers[j].size).get() results[quantity][k, i] = F for quantity in sorted(quantities): results[quantity] = results[quantity].reshape( decoherence_steps + 1, chunks_num, 
subchunks_num).mean(2).real.tolist() out_queue.put(results) else: bin_accums = [ numpy.zeros(tuple([binnum] * len(vals)), numpy.int64) for vals, binnum, _ in binning ] bin_edges = [None] * len(binning) for i in xrange(chunks_num * subchunks_num): bin_edges = [] kernel_calculate(*args, block=gen_block, grid=gen_grid, stream=stream) results = { quantity: buffers[j].get().real for j, quantity in enumerate(sorted(quantities)) } for binparam, bin_accum in zip(binning, bin_accums): qnames, binnum, ranges = binparam sample_lines = [results[quantity] for quantity in qnames] sample = numpy.concatenate( [arr.reshape(subchunk_size, 1) for arr in sample_lines], axis=1) hist, edges = numpy.histogramdd(sample, binnum, ranges) bin_accum += hist bin_edges.append(numpy.array(edges)) results = [[acc.tolist(), edges.tolist()] for acc, edges in zip(bin_accums, bin_edges)] out_queue.put(results) #ctx.pop() ctx.detach()
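# Illustrative sketch (not the original code) of the chunk-sizing logic in
# calculation() above: query device memory after creating the context and pick the
# largest power-of-ten subchunk that fits one complex128 buffer per quantity with
# roughly 10% headroom, then derive how many subchunks make up one chunk.
import numpy
import pycuda.driver as cuda

def pick_subchunk_size(chunk_size, n_quantities, device_num=0):
    cuda.init()
    ctx = cuda.Device(device_num).make_context()
    try:
        free, total = cuda.mem_get_info()
    finally:
        ctx.pop()
    max_chunk = total / n_quantities / numpy.dtype(numpy.complex128).itemsize / 1.1
    max_chunk = 10 ** int(numpy.log10(max_chunk))   # round down to a power of ten
    if max_chunk > chunk_size:
        return chunk_size, 1
    assert chunk_size % max_chunk == 0
    return max_chunk, chunk_size // max_chunk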
from xpra.codecs.image_wrapper import ImageWrapper
from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
from xpra.log import Logger, debug_if_env
log = Logger()
debug = debug_if_env(log, "XPRA_CUDA_DEBUG")
error = log.error

import threading
import os
import numpy
import time
assert bytearray

import pycuda                           #@UnresolvedImport
from pycuda import driver               #@UnresolvedImport
from pycuda.compiler import compile     #@UnresolvedImport

driver.init()

DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))

COLORSPACES_MAP = {
    "BGRA" : ("YUV420P", "YUV422P", "YUV444P"),
    "BGRX" : ("YUV420P", "YUV422P", "YUV444P"),
    "RGBA" : ("YUV420P", "YUV422P", "YUV444P"),
    "RGBX" : ("YUV420P", "YUV422P", "YUV444P"),
}
KERNELS_MAP = {}

def log_sys_info():
    log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
    log.info("PyCUDA driver version=%s", driver.get_driver_version())
def __init__(self, img_size, **kwargs): cuda.init() from pycuda.tools import make_default_context global context context = make_default_context() unknown = [] for k in kwargs.keys(): if k not in [ 'verbose', 'levels', 'resampling_factor', 'kernel_file', 'iterations', 'show_diff', 'Nfields', 'img', 'fields', 'mask', 'mul' ]: unknown.append(k) if len(unknown) != 0: warnings.warn( "Unrecognized parameter" + ('s: ' + str(unknown) if len(unknown) > 1 else ': ' + unknown[0]), SyntaxWarning) self.verbose = kwargs.get("verbose", 0) self.debug( 3, "You set the verbose level to the maximum.\n\ It may help finding bugs or tracking errors but it may also \ impact the program performance as it will print A LOT of \ output and add GPU->CPU copies only to print information.\n\ If it is not desired, consider lowering the verbosity: \ 1 or 2 is a reasonable choice, \ 0 won't show anything except for errors.") self.levels = kwargs.get("levels", 5) self.loop = 0 self.resamplingFactor = kwargs.get("resampling_factor", 2) h, w = img_size self.nbIter = kwargs.get("iterations", 4) self.debug(1, "Initializing... Master resolution:", img_size, "levels:", self.levels, "verbosity:", self.verbose) # Computing dimensions of the different levels # self.h, self.w = [], [] for i in range(self.levels): self.h.append(int(round(h / (self.resamplingFactor**i)))) self.w.append(int(round(w / (self.resamplingFactor**i)))) if kwargs.get("Nfields") is not None: self.Nfields = kwargs.get("Nfields") else: try: self.Nfields = len(kwargs["fields"]) except KeyError: self.debug( 0, "Error! You must provide the number of fields at init. \ Add Nfields=x or directly set fields with fields=list/tuple") raise ValueError kernelFile = kwargs.get("kernel_file") if kernelFile is None: self.debug( 3, "Kernel file not specified, using the one in crappy dir") from crappy import __path__ as crappyPath kernelFile = crappyPath[0] + "/data/kernels.cu" self.debug(3, "Kernel file:", kernelFile) # Creating a new instance of CorrelStage for each stage # self.correl = [] for i in range(self.levels): self.correl.append( CorrelStage((self.h[i], self.w[i]), verbose=self.verbose, Nfields=self.Nfields, iterations=self.nbIter, show_diff=(i == 0 and kwargs.get("show_diff", False)), mul=kwargs.get("mul", 3), kernel_file=kernelFile)) # Set original image if provided # if kwargs.get("img") is not None: self.setOrig(kwargs.get("img")) s = """ texture<float, cudaTextureType2D, cudaReadModeElementType> texFx{0}; texture<float, cudaTextureType2D, cudaReadModeElementType> texFy{0}; __global__ void resample{0}(float* outX, float* outY, int x, int y) {{ int idx = blockIdx.x*blockDim.x+threadIdx.x; int idy = blockIdx.y*blockDim.y+threadIdx.y; if(idx < x && idy < y) {{ outX[idy*x+idx] = tex2D(texFx{0},(float)idx/x, (float)idy/y); outY[idy*x+idx] = tex2D(texFy{0},(float)idx/x, (float)idy/y); }} }} """ self.src = "" for i in range(self.Nfields): self.src += s.format( i) # Adding textures for the quick fields resampling self.mod = SourceModule(self.src) self.texFx = [] self.texFy = [] self.resampleF = [] for i in range(self.Nfields): self.texFx.append(self.mod.get_texref("texFx%d" % i)) self.texFy.append(self.mod.get_texref("texFy%d" % i)) self.resampleF.append(self.mod.get_function("resample%d" % i)) self.resampleF[i].prepare("PPii", texrefs=[self.texFx[i], self.texFy[i]]) for t in self.texFx + self.texFy: t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) t.set_filter_mode(cuda.filter_mode.LINEAR) t.set_address_mode(0, cuda.address_mode.BORDER) t.set_address_mode(1, 
cuda.address_mode.BORDER) # Set fields if provided # if kwargs.get("fields") is not None: self.setFields(kwargs.get("fields")) if kwargs.get("mask") is not None: self.setMask(kwargs.get("mask"))
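# Small illustrative sketch (not from the original library) of how the per-level image
# sizes used by the resampling code above are derived: each pyramid level shrinks the
# master resolution by resampling_factor, rounded to the nearest integer.
def pyramid_sizes(h, w, levels=5, resampling_factor=2.0):
    sizes = []
    for i in range(levels):
        sizes.append((int(round(h / resampling_factor ** i)),
                      int(round(w / resampling_factor ** i))))
    return sizes

print(pyramid_sizes(2048, 2048))
# [(2048, 2048), (1024, 1024), (512, 512), (256, 256), (128, 128)]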
def init_all_devices(): global DEVICES, DEVICE_INFO if DEVICES is not None: return DEVICES log.info("CUDA initialization (this may take a few seconds)") DEVICES = [] DEVICE_INFO = {} try: driver.init() except Exception as e: log.error("Error: cannot initialize CUDA") log.error(" %s", e) return DEVICES log("CUDA driver version=%s", driver.get_driver_version()) ngpus = driver.Device.count() if ngpus == 0: log.info("CUDA %s / PyCUDA %s, no devices found", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT) return DEVICES cuda_device_blacklist = get_pref("blacklist") da = driver.device_attribute cf = driver.ctx_flags for i in range(ngpus): device = None context = None devinfo = "gpu %i" % i try: device = driver.Device(i) devinfo = device_info(device) if cuda_device_blacklist: blacklisted = [ x for x in cuda_device_blacklist if x and devinfo.find(x) >= 0 ] log("blacklisted(%s / %s)=%s", devinfo, cuda_device_blacklist, blacklisted) if blacklisted: log.warn( "Warning: device '%s' is blacklisted and will not be used", devinfo) continue log(" + testing device %s: %s", i, devinfo) DEVICE_INFO[i] = devinfo host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY) if not host_mem: log.warn("skipping device %s (cannot map host memory)", devinfo) continue context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST) try: log(" created context=%s", context) log(" api version=%s", context.get_api_version()) free, total = driver.mem_get_info() log(" memory: free=%sMB, total=%sMB", int(free / 1024 / 1024), int(total / 1024 / 1024)) log(" multi-processors: %s, clock rate: %s", device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE)) log(" max block sizes: (%s, %s, %s)", device.get_attribute(da.MAX_BLOCK_DIM_X), device.get_attribute(da.MAX_BLOCK_DIM_Y), device.get_attribute(da.MAX_BLOCK_DIM_Z)) log(" max grid sizes: (%s, %s, %s)", device.get_attribute(da.MAX_GRID_DIM_X), device.get_attribute(da.MAX_GRID_DIM_Y), device.get_attribute(da.MAX_GRID_DIM_Z)) max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH) max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT) log(" maximum texture size: %sx%s", max_width, max_height) log(" max pitch: %s", device.get_attribute(da.MAX_PITCH)) SMmajor, SMminor = device.compute_capability() compute = (SMmajor << 4) + SMminor log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor) if i == 0: #we print the list info "header" from inside the loop #so that the log output is bunched up together log.info("CUDA %s / PyCUDA %s, found %s device%s:", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT, ngpus, engs(ngpus)) if SMmajor >= 2: DEVICES.append(i) else: log.info(" this device is too old!") log.info(" + %s (memory: %s%% free, compute: %s.%s)", device_info(device), 100 * free / total, SMmajor, SMminor) finally: context.pop() except Exception as e: log.error("error on device %s: %s", devinfo, e) return DEVICES
def solve_gpu(currentmodelrun, modelend, G): """Solving using FDTD method on GPU. Implemented using Nvidia CUDA. Args: currentmodelrun (int): Current model run number. modelend (int): Number of last model to run. G (class): Grid class instance - holds essential parameters describing the model. Returns: tsolve (float): Time taken to execute solving """ import pycuda.driver as drv from pycuda.compiler import SourceModule drv.init() # Suppress nvcc warnings on Windows if sys.platform == 'win32': compiler_opts = ['-w'] else: compiler_opts = None # Create device handle and context on specifc GPU device (and make it current context) dev = drv.Device(G.gpu.deviceID) ctx = dev.make_context() # Electric and magnetic field updates - prepare kernels, and get kernel functions if Material.maxpoles > 0: kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]), options=compiler_opts) else: # Set to one any substitutions for dispersive materials kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1), options=compiler_opts) update_e_gpu = kernels_fields.get_function("update_e") update_h_gpu = kernels_fields.get_function("update_h") # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0] if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem: raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name)) else: drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU if Material.maxpoles > 0: # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values). 
update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A") update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B") G.gpu_initialise_dispersive_arrays() # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU G.gpu_set_blocks_per_grid() G.gpu_initialise_arrays() # PML updates if G.pmls: # Prepare kernels kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Set block per grid, initialise arrays on GPU, and get kernel functions for pml in G.pmls: pml.gpu_set_blocks_per_grid(G) pml.gpu_initialise_arrays() pml.gpu_get_update_funcs(kernels_pml) # Receivers if G.rxs: # Initialise arrays on GPU rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G) # Prepare kernel and get kernel function kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts) store_outputs_gpu = kernel_store_outputs.get_function("store_outputs") # Sources - initialise arrays on GPU, prepare kernel and get kernel functions if G.voltagesources + G.hertziandipoles + G.magneticdipoles: kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) if G.hertziandipoles: srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G) update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole") if G.magneticdipoles: srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G) update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole") if G.voltagesources: srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G) update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source") # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions if G.snapshots: # Initialise arrays on GPU snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G) # Prepare kernel and get kernel function 
kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts) store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot") # Iteration loop timer iterstart = drv.Event() iterend = drv.Event() iterstart.record() for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable): # Get GPU memory usage on final iteration if iteration == G.iterations - 1: memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0] # Store field component values for every receiver if G.rxs: store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration), rxcoords_gpu.gpudata, rxs_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1)) # Store any snapshots for i, snap in enumerate(G.snapshots): if snap.time == iteration + 1: store_snapshot_gpu(np.int32(i), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) if G.snapsgpu2cpu: gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) # Update magnetic field components update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update magnetic field components with the PML correction for pml in G.pmls: pml.gpu_update_magnetic(G) # Update magnetic field components for magetic dipole sources if G.magneticdipoles: update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata, srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1)) # Update electric field components # If all materials are non-dispersive do standard update if Material.maxpoles == 0: update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # If there are any dispersive materials do 1st part of dispersive update # (it is split into two parts as it requires present and updated electric field values). 
else: update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update electric field components with the PML correction for pml in G.pmls: pml.gpu_update_electric(G) # Update electric field components for voltage sources if G.voltagesources: update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata, srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1)) # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last) if G.hertziandipoles: update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata, srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1)) # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates. if Material.maxpoles > 0: update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Copy output from receivers array back to correct receiver objects if G.rxs: gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G) # Copy data from any snapshots back to correct snapshot objects if G.snapshots and not G.snapsgpu2cpu: for i, snap in enumerate(G.snapshots): gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) iterend.record() iterend.synchronize() tsolve = iterstart.time_till(iterend) * 1e-3 # Remove context from top of stack and delete ctx.pop() del ctx return tsolve, memsolve
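# Sketch of the block/grid bookkeeping that calls such as G.gpu_set_blocks_per_grid()
# above are responsible for. The helper below is hypothetical (not gprMax's own
# implementation): it covers an (nx, ny, nz) domain with a fixed threads-per-block
# shape by rounding each grid dimension up.
def blocks_per_grid(nx, ny, nz, tpb=(32, 8, 2)):
    return tuple((dim + t - 1) // t for dim, t in zip((nx, ny, nz), tpb))

print(blocks_per_grid(100, 100, 100))   # -> (4, 13, 50) for the default tpb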
def MetropolisCuda(InputCU): print("Inside ",InputCU) iterations=InputCU['Iterations'] steps=InputCU['Steps'] blocks=InputCU['Blocks'] threads=InputCU['Threads'] Device=InputCU['Device'] RNG=InputCU['RNG'] ValueType=InputCU['ValueType'] TestType=InputCU['IfThen'] Marsaglia,Computing,Test=DictionariesAPI() try: # For PyCUDA import import pycuda.driver as cuda from pycuda.compiler import SourceModule cuda.init() for Id in range(cuda.Device.count()): if Id==Device: XPU=cuda.Device(Id) print("GPU selected %s" % XPU.name()) print except ImportError: print("Platform does not seem to support CUDA") circle=numpy.zeros(blocks*threads).astype(numpy.uint64) circleCU = cuda.InOut(circle) #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize) #cuda.memcpy_htod(circleCU, circle) Context=XPU.make_context() try: mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])]) #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True) # Needed to set the compiler via ccbin for CUDA9 implementation #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True) except: print("Compilation seems to break") MetropolisBlocksCU=mod.get_function("MainLoopBlocks") MetropolisThreadsCU=mod.get_function("MainLoopThreads") MetropolisHybridCU=mod.get_function("MainLoopHybrid") MyDuration=numpy.zeros(steps) jobs=blocks*threads; iterationsCU=numpy.uint64(iterations/jobs) if iterations%jobs!=0: iterationsCU+=numpy.uint64(1) for i in range(steps): start_time=time.time() try: MetropolisHybridCU(circleCU, numpy.uint64(iterationsCU), numpy.uint32(110271), numpy.uint32(101008), # numpy.uint32(nprnd(2**32)), # numpy.uint32(nprnd(2**32)), grid=(blocks,1),block=(threads,1,1)) except: print("Crash during CUDA call") elapsed = time.time()-start_time print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed)) MyDuration[i]=elapsed OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration} print(OutputCU) Context.pop() Context.detach() return(OutputCU)
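# Sketch of the compile-time specialisation trick used above: constants such as the RNG
# variant are handed to the compiler as -D defines through SourceModule's options list,
# so the kernel source can branch with the preprocessor instead of at run time. The
# kernel below is a stand-in, not the Metropolis kernel itself.
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void fill(float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = (float)SCALE;
}
""", options=['-DSCALE=3'])

out = numpy.zeros(8, dtype=numpy.float32)
mod.get_function("fill")(cuda.Out(out), numpy.int32(out.size),
                         block=(32, 1, 1), grid=(1, 1, 1))
print(out)   # all elements become 3.0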
def _diffusion_child(comm, bm=None): rank = comm.Get_rank() ngpus = comm.Get_size() nodename = socket.gethostname() name = '%s %s' %(nodename, rank) print(name) if rank == 0: # split indices on GPUs indices_split = _split_indices(bm.indices, ngpus) print('Indices:', indices_split) # send data to GPUs for k in range(1, ngpus): sendToChild(comm, bm.indices, indices_split[k], k, bm.data, bm.labels, bm.label.nbrw, bm.label.sorw, bm.label.allaxis) # init cuda device cuda.init() dev = cuda.Device(rank) ctx = dev.make_context() # select the desired script if bm.label.allaxis: from pycuda_small_allx import walk else: from pycuda_small import walk # run random walks tic = time.time() walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0], bm.label.nbrw, bm.label.sorw, name) tac = time.time() print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds') # gather data zsh_tmp = bm.argmax_z - bm.argmin_z ysh_tmp = bm.argmax_y - bm.argmin_y xsh_tmp = bm.argmax_x - bm.argmin_x if ngpus > 1: final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) for k in range(bm.nol): sendbuf = np.copy(walkmap[k]) recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) comm.Barrier() comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT], root=0, op=MPI.SUM) final_zero[k] = recvbuf else: final_zero = walkmap # block and grid size block = (32, 32, 1) x_grid = (xsh_tmp // 32) + 1 y_grid = (ysh_tmp // 32) + 1 grid = (int(x_grid), int(y_grid), int(zsh_tmp)) xsh_gpu = np.int32(xsh_tmp) ysh_gpu = np.int32(ysh_tmp) # smooth if bm.label.smooth: try: update_gpu = _build_update_gpu() curvature_gpu = _build_curvature_gpu() a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) except Exception as e: print('Warning: GPU out of memory to allocate smooth array. 
Process starts without smoothing.') bm.label.smooth = 0 if bm.label.smooth: final_smooth = np.copy(final_zero) for k in range(bm.nol): a_gpu = gpuarray.to_gpu(final_smooth[k]) for l in range(bm.label.smooth): curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) final_smooth[k] = a_gpu.get() final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8) final_smooth = get_labels(final_smooth, bm.allLabels) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_smooth final = final[1:-1, 1:-1, 1:-1] bm.path_to_smooth = unique_file_path(bm.path_to_smooth, bm.image.user.username) save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type, bm.label.compression) # uncertainty if bm.label.uncertainty: try: max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32) kernel_uncertainty = _build_kernel_uncertainty() kernel_max = _build_kernel_max() for k in range(bm.nol): a_gpu = gpuarray.to_gpu(final_zero[k]) kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid) uq = a_gpu.get() uq *= 255 uq = uq.astype(np.uint8) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = uq final = final[1:-1, 1:-1, 1:-1] bm.path_to_uq = unique_file_path(bm.path_to_uq, bm.image.user.username) save_data(bm.path_to_uq, final, compress=bm.label.compression) except Exception as e: print('Warning: GPU out of memory to allocate uncertainty array. 
Process starts without uncertainty.') bm.label.uncertainty = False # free device ctx.pop() del ctx # argmax final_zero = np.argmax(final_zero, axis=0).astype(np.uint8) # save finals final_zero = get_labels(final_zero, bm.allLabels) final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8) final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_zero final = final[1:-1, 1:-1, 1:-1] bm.path_to_final = unique_file_path(bm.path_to_final, bm.image.user.username) save_data(bm.path_to_final, final, bm.header, bm.final_image_type, bm.label.compression) # create final objects shortfilename = os.path.basename(bm.path_to_final) filename = 'images/' + bm.image.user.username + '/' + shortfilename tmp = Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=1, active=1, imageType=3, shortfilename=shortfilename) tmp.friend = tmp.id tmp.save() if bm.label.uncertainty: shortfilename = os.path.basename(bm.path_to_uq) filename = 'images/' + bm.image.user.username + '/' + shortfilename Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=4, imageType=3, shortfilename=shortfilename, friend=tmp.id) if bm.label.smooth: shortfilename = os.path.basename(bm.path_to_smooth) filename = 'images/' + bm.image.user.username + '/' + shortfilename smooth = Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=5, imageType=3, shortfilename=shortfilename, friend=tmp.id) # write in logs t = int(time.time() - bm.TIC) if t < 60: time_str = str(t) + ' sec' elif 60 <= t < 3600: time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec' elif 3600 < t: time_str = str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min ' + str(t % 60) + ' sec' with open(bm.path_to_time, 'a') as timefile: print('%s %s %s %s MB %s on %s' %(time.ctime(), bm.image.user.username, bm.image.shortfilename, bm.imageSize, time_str, config['SERVER_ALIAS']), file=timefile) print('Total calculation time:', time_str) # send notification send_notification(bm.image.user.username, bm.image.shortfilename, time_str, config['SERVER_ALIAS']) # start subprocesses if config['OS'] == 'linux': # acwe q = Queue('acwe', connection=Redis()) job = q.enqueue_call(active_contour, args=(bm.image.id, tmp.id, bm.label.id,), timeout=-1) # cleanup q = Queue('cleanup', connection=Redis()) job = q.enqueue_call(remove_outlier, args=(bm.image.id, tmp.id, tmp.id, bm.label.id,), timeout=-1) if bm.label.smooth: job = q.enqueue_call(remove_outlier, args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False,), timeout=-1) # create slices q = Queue('slices', connection=Redis()) job = q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_final,), timeout=-1) if bm.label.smooth: job = q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_smooth,), timeout=-1) if bm.label.uncertainty: job = q.enqueue_call(create_slices, args=(bm.path_to_uq, None,), timeout=-1) elif config['OS'] == 'windows': # acwe Process(target=active_contour, args=(bm.image.id, tmp.id, bm.label.id)).start() # cleanup Process(target=remove_outlier, args=(bm.image.id, tmp.id, tmp.id, bm.label.id)).start() if bm.label.smooth: Process(target=remove_outlier, args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False)).start() # create slices Process(target=create_slices, args=(bm.path_to_data, bm.path_to_final)).start() if bm.label.smooth: Process(target=create_slices, args=(bm.path_to_data, bm.path_to_smooth)).start() if bm.label.uncertainty: Process(target=create_slices, 
args=(bm.path_to_uq, None)).start() else: data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0) data = np.empty((data_z, data_y, data_x), dtype=data_dtype) if data_dtype == 'uint8': comm.Recv([data, MPI.BYTE], source=0, tag=1) else: comm.Recv([data, MPI.FLOAT], source=0, tag=1) allx, nbrw, sorw = comm.recv(source=0, tag=2) if allx: labels = [] for k in range(3): labels_z, labels_y, labels_x = comm.recv(source=0, tag=k+3) labels_tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([labels_tmp, MPI.INT], source=0, tag=k+6) labels.append(labels_tmp) else: labels_z, labels_y, labels_x = comm.recv(source=0, tag=3) labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32) comm.Recv([labels, MPI.INT], source=0, tag=6) indices = comm.recv(source=0, tag=9) indices_child = comm.recv(source=0, tag=10) # init cuda device cuda.init() dev = cuda.Device(rank) ctx = dev.make_context() # select the desired script if allx: from pycuda_small_allx import walk else: from pycuda_small import walk # run random walks tic = time.time() walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name) tac = time.time() print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds') # free device ctx.pop() del ctx # send data for k in range(walkmap.shape[0]): datatemporaer = np.copy(walkmap[k]) comm.Barrier() comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
""" CUDA module; all cuda subclasses go here that is, context, kernel, and codegen """ import numpy as np from pycuda.driver import init, Device init() from pycuda import gpuarray from pycuda.compiler import SourceModule from ..context import AbstractContext import threadpy.backend.CUDA class Context(AbstractContext, threadpy.backend.CUDA.Context): """CUDA context wrapper""" def __init__(self, device = 0, context = None): #init backend threadpy.backend.CUDA.Context.__init__(self, device, context) #init threadweave specific features AbstractContext.__init__(self) #device property accessors. only used by threadweave at the moment #but they are probably better abstracted away in threadpy #note that these are far from complete def supported_axes(self):
def train_net(config): # UNPACK CONFIGS (flag_para_load, flag_datalayer, train_filenames, val_filenames, train_labels, val_labels, img_mean) = unpack_configs(config) if flag_para_load: # pycuda and zmq set up drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx = dev.make_context() sock = zmq.Context().socket(zmq.PAIR) sock.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij if config['print_train_error']: print 'training error rate:', train_error() if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_validation_error, this_validation_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_datalayer, flag_para_load, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) print('epoch %i: validation loss %f ' % (epoch, this_validation_loss)) print('epoch %i: validation error %f %%' % (epoch, this_validation_error * 100.)) val_record.append([this_validation_error, this_validation_loss]) np.save(config['weights_dir'] + 'val_record.npy', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save weights if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def cal_field(pnts, gt_pnts, gpu=0): # print("gpu",gpu) # print("CUDA_VISIBLE_DEVICES",os.environ["CUDA_VISIBLE_DEVICES"]) # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # os.environ["CUDA_VISIBLE_DEVICES"]=str(gpu) if gpu < 0: import pycuda.autoinit else: drv.init() dev1 = drv.Device(gpu) ctx1 = dev1.make_context() mod = SourceModule(""" __device__ float compute_force_scalar(float dist) { // float dist_expand = dist*100; // return 1/(dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand+1E-6); // float dist_expand = dist*1000; // return 1/(dist_expand*dist_expand*dist_expand*dist_expand+1E-12); float dist_expand = dist*1000; return 1/(dist_expand*dist_expand*dist_expand*dist_expand+1E-14); } __global__ void p2g(float *gvfs, float *pnts, float *gt_pnts, int pnt_num, int gt_num) { int p_id = blockIdx.x * blockDim.x + threadIdx.x; float px = pnts[p_id*3]; float py = pnts[p_id*3+1]; float pz = pnts[p_id*3+2]; float force, force_sum=0, x_sum=0, y_sum=0, z_sum=0; float dist, x_dist, y_dist, z_dist; for (int gt_id=0; gt_id<gt_num; gt_id++){ x_dist = gt_pnts[gt_id*3] - px; y_dist = gt_pnts[gt_id*3+1] - py; z_dist = gt_pnts[gt_id*3+2] - pz; dist = sqrt(x_dist*x_dist + y_dist*y_dist + z_dist*z_dist); force = compute_force_scalar(dist); force_sum = force_sum + force; x_sum = x_sum + x_dist * force; y_sum = y_sum + y_dist * force; z_sum = z_sum + z_dist * force; } //printf("%f ",y_sum); gvfs[p_id*3] = x_sum / force_sum; gvfs[p_id*3+1] = y_sum / force_sum; gvfs[p_id*3+2] = z_sum / force_sum; } """) kMaxThreadsPerBlock = 1024 pnt_num = pnts.shape[0] gt_num = gt_pnts.shape[0] # print("start to cal gvf gt field pnt num: ", gt_num) gvfs = np.zeros((pnt_num, 3)).astype(np.float32) gridSize = int((pnt_num + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock) pnts_tries_ivt = mod.get_function("p2g") pnts_tries_ivt(drv.Out(gvfs), drv.In(np.float32(pnts)), drv.In(np.float32(gt_pnts)), np.int32(pnt_num), np.int32(gt_num), block=(kMaxThreadsPerBlock,1,1), grid=(gridSize,1)) # print("ivt[0,0,:]", ivt[0,0,:]) if gpu >= 0: ctx1.pop() return gvfs
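# Small sketch (my own) contrasting the two transfer styles that appear in this file:
# drv.In/drv.Out/drv.InOut wrap a numpy array and handle alloc/copy/free around a single
# launch (as in cal_field above), whereas mem_alloc + memcpy_htod/dtoh keeps the device
# buffer alive across several launches. The kernel below is illustrative only.
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add_one(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}
""")
add_one = mod.get_function("add_one")
x = np.zeros(16, dtype=np.float32)

# style 1: automatic round-trip per launch
add_one(drv.InOut(x), np.int32(x.size), block=(32, 1, 1), grid=(1, 1, 1))

# style 2: explicit buffer reused for two launches
x_gpu = drv.mem_alloc(x.nbytes)
drv.memcpy_htod(x_gpu, x)
add_one(x_gpu, np.int32(x.size), block=(32, 1, 1), grid=(1, 1, 1))
add_one(x_gpu, np.int32(x.size), block=(32, 1, 1), grid=(1, 1, 1))
drv.memcpy_dtoh(x, x_gpu)
print(x)   # all 3.0 after one InOut launch plus two in-place launches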