def __call__(self, buffer_alloc):
    """
    Create the GPUArray for this tensor as a view into a pre-allocated buffer.

    The resulting view is stored on ``self._tensor`` and registered in
    ``self.transformer.tensors`` under ``self.tensor_name``.

    Arguments:
        buffer_alloc (DeviceAllocation): Memory handle returned by pycuda allocator
    """
    description = self.tensor_description
    layout = description.layout
    storage_dtype = self.transformer.storage_dtype(description.dtype)
    base_address = int(buffer_alloc)

    if not layout:
        # No layout attached: the description's own offset and strides are
        # passed through unchanged.
        view = GPUArray(description.shape, storage_dtype,
                        gpudata=base_address + description.offset,
                        strides=description.strides)
    else:
        # Layout offset and strides are multiplied by the item size before
        # being handed to GPUArray.
        byte_offset = layout.offset * storage_dtype.itemsize
        byte_strides = tuple(
            [stride * storage_dtype.itemsize for stride in layout.strides])
        view = GPUArray(layout.shape, storage_dtype,
                        gpudata=base_address + byte_offset,
                        strides=byte_strides)

    self._tensor = view
    self.transformer.tensors[self.tensor_name] = self._tensor
def __setitem__(self, key, value):
    """
    Assign ``value`` to the part of the device tensor selected by ``key``.

    Scalar values are written with ``fill``. Array values are converted to
    the tensor dtype when necessary and then copied, either directly or
    through ``self.from_other`` when a plain copy is not possible.

    Arguments:
        key: Index forwarded to ``__getitem__`` to select the destination view
        value: Scalar, or an array exposing ``dtype`` and ``shape``
    """
    sliced = self.__getitem__(key)

    # Use fill for scalar values. Exact type matches are kept on purpose so
    # that subclasses (e.g. bool) are not treated as scalars here.
    scalar_types = (np.float32, np.float64, float, np.int32, np.int64, int)
    if type(value) in scalar_types:
        sliced.fill(value)
    elif self.tensor.shape == () or np.prod(self.tensor.shape) == 1:
        sliced.fill(value)
    elif np.sum(self.tensor.strides) == 0:
        # All strides are zero, so every element aliases one storage location.
        # The one-element view must be built on the tensor's own memory;
        # without gpudata a new buffer is allocated and the write is lost.
        view = GPUArray((1, ), dtype=self.tensor.dtype,
                        gpudata=self.tensor.gpudata)
        view.fill(value)
    else:
        # Convert to correct dtype if necessary
        if value.dtype != self.tensor.dtype:
            new_value = np.ndarray(self.tensor.shape, dtype=self.tensor.dtype)
            new_value[:] = value
            value = new_value

        # Reshape to satisfy pycuda if necessary
        if sliced.shape != value.shape:
            sliced = self.tensor.reshape(value.shape)

        if self.is_contiguous and self.strides_contiguous(value):
            sliced[:] = value
        elif type(value) == GPUArray:
            self.from_other(value, sliced)
        else:
            # Stage the data in a contiguous device array first, then let
            # from_other copy it into the destination view.
            contig_tensor = GPUArray(value.shape, self.tensor.dtype)
            contig_tensor[:] = value
            self.from_other(contig_tensor, sliced)
def errest(self, x, y, z, *, norm):
    """
    Build a kernel that reduces a scaled error estimate over three matrices.

    Each element contributes ``(x / (atol + rtol*max(|y|, |z|)))**2``; the
    contributions are summed when ``norm == 'l2'`` and reduced with ``max``
    for any other value of ``norm``.

    Arguments:
        x, y, z: Matrices exposing ``traits``, ``leaddim``, ``nrow`` and
            ``dtype``; all three must have equal ``traits``
        norm: ``'l2'`` selects the sum reduction, anything else the max

    Returns:
        A ComputeKernel whose ``run(queue, atol, rtol)`` launches the
        reduction and whose ``retval`` fetches the result from the device.

    Raises:
        ValueError: If the three matrices do not all share the same traits
    """
    # A chained `a != b != c` means `a != b and b != c`, which lets a single
    # mismatch through; every pair has to agree.
    if x.traits != y.traits or y.traits != z.traits:
        raise ValueError('Incompatible matrix types')

    # Wrap the matrices as flat arrays of leaddim*nrow elements
    xarr = GPUArray(x.leaddim*x.nrow, x.dtype, gpudata=x)
    yarr = GPUArray(y.leaddim*y.nrow, y.dtype, gpudata=y)
    zarr = GPUArray(z.leaddim*z.nrow, z.dtype, gpudata=z)

    # Norm type
    reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

    # Build the reduction kernel
    rkern = ReductionKernel(
        x.dtype, neutral='0', reduce_expr=reduce_expr,
        map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
        arguments='{0}* x, {0}* y, {0}* z, {0} atol, {0} rtol'
                  .format(npdtype_to_ctype(x.dtype))
    )

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            # Only valid after run() has stored the reduction result
            return self._retarr.get()

        def run(self, queue, atol, rtol):
            self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                 stream=queue.cuda_stream_comp)

    return ErrestKernel()
def __setitem__(self, key, value):
    """
    Assign ``value`` to the part of the device tensor selected by ``key``.

    Python scalars and 0-d / single-element numpy arrays are first turned
    into numpy scalars. Scalars are written with ``fill`` after casting to
    the destination dtype; arrays are converted to the tensor dtype when
    necessary and copied, directly or through ``self.from_other``.

    Arguments:
        key: Index forwarded to ``__getitem__`` to select the destination view
        value: Scalar, numpy array or GPUArray to store
    """
    sliced = self.__getitem__(key)

    # Convert value to a numpy scalar where possible so that ``astype`` is
    # available in the fill branches below.
    if type(value) == float:
        value = np.float64(value)
    elif type(value) == int:
        value = np.int64(value)
    elif isinstance(value, np.ndarray):
        # handle 0-d and 1-d conversion to scalar
        if value.shape == ():
            value = value[()]
        elif value.shape == (1, ):
            value = value[0]

    # flex: added astype to deal with GPUArray dtype int16
    # FLEX TODO: assumed same behavior for all cases
    if type(value) in (np.int32, np.int64, int, np.uint32, np.float32, np.float64):
        sliced.fill(value.astype(sliced.dtype))
    elif self.tensor.shape == () or np.prod(self.tensor.shape) == 1:
        sliced.fill(value.astype(sliced.dtype))
    elif np.sum(self.tensor.strides) == 0:
        # All strides are zero, so every element aliases one storage location.
        # The one-element view must be built on the tensor's own memory;
        # without gpudata a new buffer is allocated and the write is lost.
        view = GPUArray((1, ), dtype=self.tensor.dtype,
                        gpudata=self.tensor.gpudata)
        view.fill(value.astype(sliced.dtype))
    else:
        # Convert to correct dtype if necessary
        if value.dtype != self.tensor.dtype:
            new_value = np.ndarray(value.shape, dtype=self.tensor.dtype)
            new_value[:] = value
            value = new_value

        # Reshape to satisfy pycuda if necessary
        if sliced.shape != value.shape:
            sliced = self.tensor.reshape(value.shape)

        if self.is_contiguous and self.strides_contiguous(value):
            if sliced.shape == ():
                # 0-d arrays are assigned through a one-element reshape
                sliced.reshape((1, ))[:] = value.reshape((1, ))
            else:
                sliced[:] = value
        elif type(value) == GPUArray:
            self.from_other(value, sliced)
        else:
            # Stage the data in a contiguous device array first, then let
            # from_other copy it into the destination view.
            contig_tensor = GPUArray(value.shape, self.tensor.dtype)
            contig_tensor[:] = value
            self.from_other(contig_tensor, sliced)
def __getitem__(self, index):
    """
    Return a GPUArray view of the tensor selected by ``index``.

    The view is built by computing a new shape, new strides and a byte
    offset from the tensor's base address; no data is copied.

    Arguments:
        index: ``None``, an empty tuple, a single entry, or a tuple of
            entries. Each entry is a slice, an integer or ``Ellipsis``.

    Returns:
        ``self.tensor`` itself for ``None`` / full-slice / ``()`` indices,
        otherwise a new GPUArray view.

    Raises:
        IndexError: For too many axes, an out-of-range integer, more than
            one ellipsis, a misplaced ellipsis or an unsupported entry.
    """
    if index is None or index == _none_slice or index == ():
        return self.tensor
    elif not isinstance(index, tuple):
        index = (index,)

    # Slice tensor by changing shape, strides, and base address
    shape = self.tensor.shape
    dtype = self.tensor.dtype
    strides = self.tensor.strides

    new_shape = []
    new_strides = []
    new_offset = 0
    seen_ellipsis = False

    # Iterate over axes of index to compute new offset, shape, strides
    array_axis = 0
    for index_axis, index_entry in enumerate(index):
        if isinstance(index_entry, slice):
            # Standard slicing (start:stop:step)
            if array_axis >= len(shape):
                raise IndexError("Too many axes in index")
            start, stop, step = index_entry.indices(shape[array_axis])
            new_offset += start * strides[array_axis]
            # len(range(...)) is the element count and is never negative,
            # even for empty slices such as 3:1.
            new_shape.append(len(range(start, stop, step)))
            new_strides.append(step * strides[array_axis])
            array_axis += 1
        elif isinstance(index_entry, (int, np.integer)):
            # Single index value: drops the axis and shifts the base address
            if array_axis >= len(shape):
                raise IndexError("Too many axes in index")
            axis_len = shape[array_axis]
            position = int(index_entry)
            if position < 0:
                # Negative indices count from the end of the axis
                position += axis_len
            if not 0 <= position < axis_len:
                raise IndexError(
                    "Index %d is out of bounds for axis %d with size %d" %
                    (index_entry, array_axis, axis_len))
            new_offset += position * strides[array_axis]
            array_axis += 1
        elif index_entry is Ellipsis:
            # Use same shape as original for these axes
            if seen_ellipsis:
                raise IndexError(
                    "More than one ellipsis not allowed in index")
            seen_ellipsis = True

            remaining_index_count = len(index) - (index_axis + 1)
            new_array_axis = len(shape) - remaining_index_count
            if new_array_axis < array_axis:
                raise IndexError("Invalid use of ellipsis in index")
            while array_axis < new_array_axis:
                new_shape.append(shape[array_axis])
                new_strides.append(strides[array_axis])
                array_axis += 1
        else:
            raise IndexError("Invalid subindex %s in axis %d" %
                             (index_entry, index_axis))

    # NOTE(review): axes not covered by the index are not appended to the
    # view's shape; confirm callers always index every axis or use an
    # ellipsis.

    # Create view
    return GPUArray(new_shape, dtype, strides=new_strides,
                    gpudata=(self.tensor.gpudata + new_offset))
def get(self, tensor=None):
    """
    Copy the device tensor to a numpy array.

    Arguments:
        tensor (np.ndarray): Optional output array. When omitted or ``None``
            a new array is returned; otherwise it is filled in place.

    Returns:
        Numpy array containing tensor data (``tensor`` itself when given)
    """
    if np.sum(self.tensor.strides) != 0:
        if self.is_contiguous or self.tensor.shape == () or \
                np.prod(self.tensor.shape) == 1:
            contig_tensor = self.tensor
        else:
            # Need to do memcpy from contiguous device memory
            contig_tensor = self.as_contiguous()

        if tensor is None:
            return contig_tensor.get()
        tensor[:] = contig_tensor.get()
    else:
        # Tensor is just a broadcasted scalar, get scalar value and fill
        # output array
        view = GPUArray((1, ), dtype=self.tensor.dtype,
                        gpudata=self.tensor.gpudata)[0]
        value = view.get()

        if tensor is None:
            out = np.ndarray(self.tensor.shape, dtype=self.tensor.dtype)
            out.fill(value)
            return out
        tensor.fill(value)

    return tensor
def consume(self, buf_index, hostlist, devlist):
    """
    Copy one host buffer, transposed, into its device buffer.

    The device array is allocated on first use with the reversed host
    shape and stored back into ``devlist``.

    Arguments:
        buf_index (int): Which of the two buffers to transfer (0 or 1)
        hostlist: Host-side arrays, indexed by ``buf_index``
        devlist: Device-side arrays (or ``None`` placeholders), indexed by
            ``buf_index``; updated in place on first allocation
    """
    assert 0 <= buf_index < 2, 'Can only double buffer'

    self.ctx.push()
    try:
        hbuf = hostlist[buf_index]
        if devlist[buf_index] is None:
            shape, dtype = hbuf.shape[::-1], hbuf.dtype
            devlist[buf_index] = GPUArray(shape, dtype)
        devlist[buf_index].set(hbuf.T)
    finally:
        # Always undo the push, even if allocation or the copy raised;
        # otherwise the context would stay on the stack.
        self.ctx.pop()
def rand(shape, dtype=numpy.float32, stream=None):
    """Return a new GPUArray of ``shape`` filled by an md5-based kernel.

    A kernel is selected by ``dtype`` (float32, float64, int32 or uint32),
    launched on ``stream`` and seeded with a value drawn from
    ``numpy.random.randint``.

    Raises NotImplementedError for any other dtype.
    """
    from pycuda.gpuarray import GPUArray
    from pycuda.elementwise import get_elwise_kernel

    result = GPUArray(shape, dtype)

    # Each kernel body is appended to md5_code; presumably md5_code defines
    # a, b, c, d and total_threads -- verify against its definition.
    if dtype == numpy.float32:
        # Four outputs per invocation, each scaled by 2**-32.
        func = get_elwise_kernel(
            "float *dest, unsigned int seed",
            md5_code + """
            #define POW_2_M32 (1/4294967296.0f)
            dest[i] = a*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = b*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = c*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = d*POW_2_M32;
            """,
            "md5_rng_float")
    elif dtype == numpy.float64:
        # Two outputs per invocation, each combining two of the four words.
        func = get_elwise_kernel(
            "double *dest, unsigned int seed",
            md5_code + """
            #define POW_2_M32 (1/4294967296.0)
            #define POW_2_M64 (1/18446744073709551616.)

            dest[i] = a*POW_2_M32 + b*POW_2_M64;

            if ((i += total_threads) < n)
            {
              dest[i] = c*POW_2_M32 + d*POW_2_M64;
            }
            """,
            "md5_rng_float")
    elif dtype in [numpy.int32, numpy.uint32]:
        # Four outputs per invocation, stored unscaled.
        func = get_elwise_kernel(
            "unsigned int *dest, unsigned int seed",
            md5_code + """
            dest[i] = a;
            if ((i += total_threads) < n)
                dest[i] = b;
            if ((i += total_threads) < n)
                dest[i] = c;
            if ((i += total_threads) < n)
                dest[i] = d;
            """,
            "md5_rng_int")
    else:
        raise NotImplementedError;

    func.set_block_shape(*result._block)
    func.prepared_async_call(result._grid, stream,
            result.gpudata, numpy.random.randint(2**31-1), result.size)

    return result
def toGpuArray(f):
    """Wrap a waLBerla GPUField in a pycuda GPUArray; a falsy field gives None."""
    if f:
        element_type = np.dtype(f.dtypeStr)
        # Field strides are multiplied by the item size before use
        byte_strides = [element_type.itemsize * stride for stride in f.strides]
        return GPUArray(f.sizeWithGhostLayers, element_type,
                        gpudata=f.ptr, strides=byte_strides)
    return None
def arrayp2g(pary):
    """Copy the contents of a PitchArray into a newly created GPUArray."""
    from pycuda.gpuarray import GPUArray

    result = GPUArray(pary.shape, pary.dtype)
    if not pary.size:
        # Nothing to copy for an empty array
        return result

    if pary.M == 1:
        # Single device-to-device copy of mem_size elements
        byte_count = pary.mem_size * pary.dtype.itemsize
        cuda.memcpy_dtod(result.gpudata, pary.gpudata, byte_count)
    else:
        # Otherwise go through PitchTrans, which receives both leading
        # dimensions (_pd(result.shape) and pary.ld).
        PitchTrans(pary.shape, result.gpudata, _pd(result.shape),
                   pary.gpudata, pary.ld, pary.dtype)
    return result
def dot(x, y):
    """
    Matrix product of ``x`` and ``y``.

    Host inputs (``x`` not a GPUArray) always go through ``np.dot``. For
    GPUArray inputs the product is computed with ``sgemm`` when CUBLAS is
    enabled, otherwise on the host with the result uploaded again.

    Arguments:
        x: Left operand, GPUArray or array accepted by ``np.dot``
        y: Right operand of the same kind as ``x``

    Returns:
        GPUArray for GPUArray inputs, otherwise the ``np.dot`` result
    """
    # Check the operand kind first: the CUBLAS-disabled fallback below calls
    # .get(), which only device arrays provide.
    if not isinstance(x, GPUArray):
        return np.dot(x, y)

    if not CUBLAS_ENABLED:
        return gpuarray.to_gpu(np.dot(x.get(), y.get()))

    # sgemm is called with both operands flagged 't' and writes into a
    # (y.shape[1], x.shape[0]) buffer, which is transposed afterwards.
    result = GPUArray((y.shape[1], x.shape[0]), dtype=x.dtype)
    sgemm('t', 't', x.shape[0], y.shape[1], x.shape[1], 1.0,
          x.gpudata, x.shape[1], y.gpudata, y.shape[1], 0.0,
          result.gpudata, result.shape[1])
    result = transpose(result)
    return result
def as_contiguous(self):
    """
    Copy the tensor into a newly created GPUArray of the same shape and dtype.

    Returns:
        New contiguous GPUArray with separate underlying device allocation
    """
    contig_tensor = GPUArray(self.tensor.shape, dtype=self.tensor.dtype)

    # Strides are handed to the kernel divided by the item size
    src_item = self.tensor.dtype.itemsize
    src_strides = [stride // src_item for stride in self.tensor.strides]
    dst_item = contig_tensor.dtype.itemsize
    dst_strides = [stride // dst_item for stride in contig_tensor.strides]

    # The axis order passed is range(ndim), i.e. the axes in their
    # existing order.
    axis_order = range(len(self.tensor.shape))
    kernel = _get_copy_transpose_kernel(self.tensor.dtype, self.tensor.shape,
                                        axis_order)

    # Argument order: destination, source, kernel args, then both stride lists
    params = [contig_tensor.gpudata, self.tensor.gpudata]
    params.extend(kernel.args)
    params.extend(src_strides)
    params.extend(dst_strides)
    kernel.prepared_async_call(kernel.grid, kernel.block, None, *params)

    return contig_tensor
def __call__(self, buffer_alloc):
    """
    Create the GPUArray for this tensor as a view into a pre-allocated buffer.

    The view is stored on ``self._tensor`` and registered in
    ``self.transformer.tensors`` under ``self.tensor_name``.

    Arguments:
        buffer_alloc (DeviceAllocation): Memory handle returned by pycuda allocator
    """
    description = self.tensor_description
    # The view starts at the buffer address plus the description's offset
    view_address = int(buffer_alloc) + description.offset
    self._tensor = GPUArray(description.shape,
                            description.dtype,
                            gpudata=view_address,
                            strides=description.strides)
    self.transformer.tensors[self.tensor_name] = self._tensor
def toGpuArray(f, withGhostLayers=True):
    """Wrap a waLBerla GPUField in a pycuda GPUArray.

    A falsy field gives None. With ``withGhostLayers`` exactly ``True`` the
    full array including ghost layers is returned; any other value is
    normalized through ``normalizeGhostlayerInfo`` and the surplus ghost
    layers are sliced off the first three axes.
    """
    if not f:
        return None

    element_type = np.dtype(f.dtypeStr)
    byte_strides = [element_type.itemsize * stride for stride in f.strides]
    full_view = GPUArray(f.sizeWithGhostLayers, element_type,
                         gpudata=f.ptr, strides=byte_strides)
    if withGhostLayers is True:
        return full_view

    kept_layers = normalizeGhostlayerInfo(f, withGhostLayers)
    glCutoff = [f.nrOfGhostLayers - kept for kept in kept_layers]

    def trim(cut):
        # A cutoff of 0 must map to an open-ended stop, because -0 would
        # produce an empty slice.
        return slice(cut, -cut if cut > 0 else None)

    selection = (trim(glCutoff[0]), trim(glCutoff[1]), trim(glCutoff[2]),
                 slice(None))
    return full_view[selection]
def consume(self, buf_index, hostlist, devlist):
    """
    Split one host buffer by rows across the devices and start the copies.

    Each device receives a transposed fragment of ``hbuf.shape[0] //
    self.num_dev`` rows via ``set_async`` on that device's stream. The
    per-device destination arrays are allocated on first use and stored
    back into ``devlist``.

    Arguments:
        buf_index (int): Which of the two buffers to transfer (0 or 1)
        hostlist: Host-side arrays, indexed by ``buf_index``
        devlist: Per-buffer lists of device arrays (or ``None``), indexed by
            ``buf_index``; updated in place on first allocation
    """
    assert 0 <= buf_index < 2, 'Can only double buffer'
    hbuf = hostlist[buf_index]
    frag_sz, ndims, ndtype = (hbuf.shape[0] // self.num_dev, hbuf.shape[1],
                              hbuf.dtype)

    # Create fragment array destination if missing
    if devlist[buf_index] is None:
        # Build the list locally and publish it only once every allocation
        # succeeded, so a failure never leaves a partial list behind.
        fragments = []
        for ctx in self.ctxs:
            ctx.push()
            try:
                fragments.append(GPUArray((ndims, frag_sz), ndtype))
            finally:
                ctx.pop()
        devlist[buf_index] = fragments

    # Initiate the transfer
    # NOTE(review): the row range is derived from the device id itself, not
    # its position in the list; presumably device_ids is 0..num_dev-1 --
    # confirm.
    for idx, ctx, dbuf, strm in zip(self.device_ids, self.ctxs,
                                    devlist[buf_index], self.streams):
        ctx.push()
        try:
            dbuf.set_async(hbuf[idx * frag_sz:(idx + 1) * frag_sz, :].T, strm)
        finally:
            # Always undo the push, even if the copy raised
            ctx.pop()
def dot(x, y):
    """
    Product of ``x`` and ``y`` for GPUArrays, or ``np.dot`` for host arrays.

    GPUArray cases, in order:
      * one operand has shape ``(1,)``: the other operand is scaled
        IN PLACE by that scalar and returned flattened;
      * both operands are 1-D: scalar inner product;
      * otherwise: matrix product through ``sgemm``, with 1-D operands
        promoted to a row (``x``) or a column (``y``) and the result
        flattened again.

    The call is timed under the label ``'dot'`` on every path.

    Arguments:
        x: Left operand, GPUArray or array accepted by ``np.dot``
        y: Right operand; must be a GPUArray when ``x`` is one

    Returns:
        GPUArray or scalar for GPUArray inputs, otherwise the ``np.dot``
        result
    """
    timer.start()
    try:
        if not isinstance(x, GPUArray):
            return np.dot(x, y)

        assert isinstance(y, GPUArray)
        if x.shape == (1, ):
            assert y.shape[0] == 1
            y *= scalar(x)
            return y.ravel()
        elif y.shape == (1, ):
            assert x.shape[1] == 1
            x *= scalar(y)
            return x.ravel()
        elif len(x.shape) == 1 and len(y.shape) == 1:
            return scalar(pycuda.gpuarray.dot(x, y))

        needs_ravel = False
        if len(x.shape) == 1:
            needs_ravel = True
            x = x.reshape((1, ) + x.shape)
        if len(y.shape) == 1:
            needs_ravel = True
            y = y.reshape(y.shape + (1, ))

        # sgemm is called with both operands flagged 't' and writes into a
        # (y.shape[1], x.shape[0]) buffer, which is transposed afterwards.
        result = GPUArray((y.shape[1], x.shape[0]), dtype=x.dtype)
        sgemm('t', 't', x.shape[0], y.shape[1], x.shape[1], 1.0,
              x.gpudata, x.shape[1], y.gpudata, y.shape[1], 0.0,
              result.gpudata, result.shape[1])
        result = transpose(result)

        if needs_ravel:
            assert result.shape[1] == 1 or result.shape[0] == 1
            result = result.ravel()
        return result
    finally:
        # Pair every timer.start() with an end; previously only the sgemm
        # path closed the timer.
        timer.end('dot')
# Generate coordinates of non-uniform points. kx = np.random.uniform(-np.pi, np.pi, size=M) ky = np.random.uniform(-np.pi, np.pi, size=M) # Generate source strengths. c = (np.random.standard_normal((n_transf, M)) + 1j * np.random.standard_normal( (n_transf, M))) # Cast to desired datatype. kx = kx.astype(dtype) ky = ky.astype(dtype) c = c.astype(complex_dtype) # Allocate memory for the uniform grid on the GPU. fk_gpu = GPUArray((n_transf, N1, N2), dtype=complex_dtype) # Initialize the plan and set the points. plan = cufinufft(1, (N1, N2), n_transf, eps=eps, dtype=dtype) plan.set_pts(to_gpu(kx), to_gpu(ky)) # Execute the plan, reading from the strengths array c and storing the # result in fk_gpu. plan.execute(to_gpu(c), fk_gpu) # Retreive the result from the GPU. fk = fk_gpu.get() # Check accuracy of the transform at position (nt1, nt2). nt1 = int(0.37 * N1) nt2 = int(0.26 * N2)
# Generate coordinates of non-uniform points. kx = np.random.uniform(-np.pi, np.pi, size=M) ky = np.random.uniform(-np.pi, np.pi, size=M) # Generate grid values. fk = (np.random.standard_normal((n_transf, N1, N2)) + 1j * np.random.standard_normal((n_transf, N1, N2))) # Cast to desired datatype. kx = kx.astype(dtype) ky = ky.astype(dtype) fk = fk.astype(complex_dtype) # Allocate memory for the nonuniform coefficients on the GPU. c_gpu = GPUArray((n_transf, M), dtype=complex_dtype) # Initialize the plan and set the points. plan = cufinufft(2, (N1, N2), n_transf, eps=eps, dtype=dtype) plan.set_pts(to_gpu(kx), to_gpu(ky)) # Execute the plan, reading from the uniform grid fk c and storing the result # in c_gpu. plan.execute(c_gpu, to_gpu(fk)) # Retreive the result from the GPU. c = c_gpu.get() # Check accuracy of the transform at index jt. jt = M // 2
def zeros(length, dtype=np.float64):
    """
    Create a GPUArray whose memory is set to zero.

    Arguments:
        length: Shape argument forwarded to GPUArray
        dtype: Element type of the array (default ``np.float64``)

    Returns:
        New GPUArray with every byte cleared
    """
    result = GPUArray(length, dtype=dtype)
    # Clear byte-wise: a 32-bit word count derived from nbytes / 4 is a float
    # under true division and skips trailing bytes when nbytes is not a
    # multiple of 4 (e.g. an odd number of int8/int16 elements).
    if result.nbytes:
        pycuda.driver.memset_d8(result.gpudata, 0, result.nbytes)
    return result
def test_project_shepp_logan(with_spline):
    """Compile and run GPU forward-projection kernels on a phantom volume.

    Two variants are exercised: one with a fixed rotation of the volume and
    one where the rotation angle is a symbolic kernel argument that is swept
    over [0, pi). ``with_spline`` is forwarded as
    ``cubic_bspline_interpolation`` and also enables the B-spline prefilter.
    The test contains no assertions; it only checks that everything runs.
    """
    from pycuda.gpuarray import to_gpu, GPUArray
    from sympy.matrices.dense import MutableDenseMatrix
    # Replace the matrix hash with a constant for the duration of the process.
    MutableDenseMatrix.__hash__ = lambda x: 1  # hash(tuple(x))

    try:
        import pyconrad.autoinit
        phantom3d = pyconrad.phantoms.shepp_logan(100, 100, 100)
        pyconrad.imshow(phantom3d, 'phantom')
    except Exception:
        # Fall back to random data when the phantom cannot be created.
        # NOTE(review): the fallback volume is 30x31x32 while the field below
        # is declared as 100^3, and pyconrad is still used further down --
        # confirm this path is expected to work.
        phantom3d = np.random.rand(30, 31, 32)

    # Variant 1: fixed rotation about axis 2, scaled by 3.
    for i, projection_matrix in enumerate((m1, )):

        volume = pystencils.fields('volume: float32[100,100,100]')
        projections = pystencils.fields('projections: float32[1024,960]')
        volume.set_coordinate_origin_to_field_center()
        volume.coordinate_transform = sympy.rot_axis2(0.2)
        # volume.coordinate_transform = sympy.rot_axis3(0.1)
        volume.coordinate_transform = 3 * volume.coordinate_transform
        projections.set_coordinate_origin_to_field_center()

        kernel = forward_projection(volume, projections, projection_matrix,
                                    step_size=1,
                                    cubic_bspline_interpolation=with_spline)
        print(kernel)
        kernel = kernel.compile('gpu')
        # print(kernel.code)

        volume_gpu = to_gpu(np.ascontiguousarray(phantom3d, np.float32))
        if with_spline:
            pystencils.gpucuda.texture_utils.prefilter_for_cubic_bspline(
                volume_gpu)
        projection_gpu = GPUArray(projections.spatial_shape, np.float32)

        kernel(volume=volume_gpu, projections=projection_gpu)

        pyconrad.imshow(volume_gpu, 'volume ' + str(with_spline))
        pyconrad.imshow(projection_gpu,
                        'projections ' + str(i) + str(with_spline))

    # Variant 2: the rotation angle is a symbol passed at call time.
    for i, projection_matrix in enumerate((m1, )):

        angle = pystencils_reco.typed_symbols('angle', 'float32')
        volume = pystencils.fields('volume: float32[100,100,100]')
        projections = pystencils.fields('projections: float32[1024,960]')
        volume.set_coordinate_origin_to_field_center()
        volume.coordinate_transform = sympy.rot_axis2(angle)
        # volume.coordinate_transform = sympy.rot_axis3(0.1)
        volume.coordinate_transform = 3 * volume.coordinate_transform
        projections.set_coordinate_origin_to_field_center()

        kernel = forward_projection(volume, projections, projection_matrix,
                                    step_size=1,
                                    cubic_bspline_interpolation=with_spline)
        print(kernel)
        kernel = kernel.compile('gpu')
        # print(kernel.code)

        volume_gpu = to_gpu(np.ascontiguousarray(phantom3d, np.float32))
        if with_spline:
            pystencils.gpucuda.texture_utils.prefilter_for_cubic_bspline(
                volume_gpu)
        projection_gpu = GPUArray(projections.spatial_shape, np.float32)

        # Sweep the angle over 100 steps and show each projection.
        for phi in np.arange(0, np.pi, np.pi / 100):
            kernel(volume=volume_gpu, projections=projection_gpu, angle=phi)
            pyconrad.imshow(projection_gpu, 'rotation!' + str(with_spline))

    pyconrad.close_all_windows()
# Read the run configuration from arg_dict, falling back to defaults.
print(arg_dict)
formula = arg_dict.pop('formula', 'CH3COOH')
zoom = arg_dict.pop('zoom', 1.5)
repeat = arg_dict.pop('repeat', 4)
nlaunch = arg_dict.pop('nlaunch', 1)
block_per_sm = arg_dict.pop('block_per_sm', 8)
block_size = arg_dict.pop('block_size', 128)

# The job count scales with the multiprocessor count of the chosen device.
device = pycuda.driver.Device(arg_dict.pop('device', 0))
njobs = device.MULTIPROCESSOR_COUNT * block_per_sm * repeat

# Build a graph from the molecule named by `formula`.
g = Graph.from_ase(molecule(formula), adjacency=dict(h=zoom))
kernel = Kernel()

''' generate jobs '''
# Every job gets its own float32 output buffer with len(g.nodes)**2 elements.
jobs = [Job(0, 0, GPUArray(len(g.nodes)**2, np.float32))
        for i in range(njobs)]

''' call GPU kernel '''
for i in range(nlaunch):
    kernel.kernel._launch_kernel([g], jobs, nodal=False, lmin=0)

# Use the first job's output as the reference result.
R = jobs[0].vr_gpu.get().reshape(len(g.nodes), -1)
r = R.sum()
print('Nodal similarity:\n', R, sep='')
print('Overall similarity:\n', r, sep='')

# Every job's total must agree with the reference within a relative 1e-6.
for job in jobs:
    assert (np.abs(job.vr_gpu.get().sum() - r) < r * 1e-6)
print('**ALL PASSED**')
def calculation(in_queue, out_queue):
    """Run the sampling kernels on one CUDA device and report the results.

    Reads a ``(device_num, params)`` pair from ``in_queue``, compiles the
    kernels rendered from ``TEMPLATE``, runs them chunk by chunk and puts a
    single result object on ``out_queue``:

    * without ``params['binning']``: a dict mapping each quantity to nested
      lists of real-valued means, indexed by decoherence step and chunk;
    * with binning: a list of ``[histogram, edges]`` pairs, one per binning
      entry, with the histograms accumulated over all chunks.

    Note: this is Python 2 code (print statements, xrange).
    """
    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    # Without decoherence settings only step 0 is computed.
    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    # With binning, the quantity list is replaced by the sorted union of the
    # names used by the binning entries.
    binning = params['binning']
    if binning is not None:
        s = set()
        for names, _, _ in binning:
            s.update(names)
        quantities = sorted(list(s))

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_dtype = numpy.float64
    s_ctype = 'double'

    Fs = []

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    free, total = cuda.mem_get_info()
    # Size sub-chunks from total device memory (with a 1.1 divisor as
    # headroom), rounded down to a power of ten.
    max_chunk_size = float(total) / len(quantities) / numpy.dtype(
        c_dtype).itemsize / 1.1
    max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))
    #print free, total, max_chunk_size

    if max_chunk_size > chunk_size:
        subchunk_size = chunk_size
        subchunks_num = 1
    else:
        assert chunk_size % max_chunk_size == 0
        subchunk_size = max_chunk_size
        subchunks_num = chunk_size / subchunk_size

    # One complex device buffer per quantity, in sorted quantity order.
    buffers = []
    for quantity in sorted(quantities):
        buffers.append(GPUArray(subchunk_size, c_dtype))

    stream = cuda.Stream()

    # compile code
    try:
        source = TEMPLATE.render(c_ctype=c_ctype,
                                 s_ctype=s_ctype,
                                 particles=particles,
                                 state=state,
                                 representation=representation,
                                 quantities=quantities,
                                 decoherence_coeff=decoherence_coeff)
    except:
        # Print the template error report, then re-raise.
        print exceptions.text_error_template().render()
        raise

    try:
        module = SourceModule(source, no_extern_c=True)
    except:
        # Dump the generated source with line numbers, then re-raise.
        for i, l in enumerate(source.split("\n")):
            print i + 1, ":", l
        raise

    kernel_initialize = module.get_function("initialize")
    kernel_calculate = module.get_function("calculate")
    kernel_decoherence = module.get_function("decoherence")

    # prepare call parameters

    gen_block_size = min(kernel_initialize.max_threads_per_block,
                         kernel_calculate.max_threads_per_block)
    gen_grid_size = device.get_attribute(
        cuda.device_attribute.MULTIPROCESSOR_COUNT)
    gen_block = (gen_block_size, 1, 1)
    gen_grid = (gen_grid_size, 1, 1)

    num_gen = gen_block_size * gen_grid_size
    assert num_gen <= 20000

    # prepare RNG states

    #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32))
    seeds = to_gpu(
        numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32))
    state_type_size = sizeof("curandStateXORWOW",
                             "#include <curand_kernel.h>")
    states = cuda.mem_alloc(num_gen * state_type_size)

    #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k
    kernel_initialize(states,
                      seeds.gpudata,
                      block=gen_block,
                      grid=gen_grid,
                      stream=stream)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size)

    # run calculation
    args = [states] + [buf.gpudata
                       for buf in buffers] + [numpy.int32(subchunk_size)]

    if binning is None:

        # One complex mean per quantity, decoherence step and sub-chunk.
        results = {
            quantity: numpy.zeros(
                (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype)
            for quantity in quantities
        }
        for i in xrange(chunks_num * subchunks_num):
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)

            for k in xrange(decoherence_steps + 1):
                # Step 0 uses the freshly calculated buffers; each later
                # step runs the decoherence kernel first.
                if k > 0:
                    kernel_decoherence(*args,
                                       block=gen_block,
                                       grid=gen_grid,
                                       stream=stream)

                for j, quantity in enumerate(sorted(quantities)):
                    F = (gpuarray.sum(buffers[j], stream=stream) /
                         buffers[j].size).get()
                    results[quantity][k, i] = F

        # Average the sub-chunks of each chunk and keep the real part.
        for quantity in sorted(quantities):
            results[quantity] = results[quantity].reshape(
                decoherence_steps + 1, chunks_num,
                subchunks_num).mean(2).real.tolist()

        out_queue.put(results)

    else:
        bin_accums = [
            numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
            for vals, binnum, _ in binning
        ]
        bin_edges = [None] * len(binning)

        for i in xrange(chunks_num * subchunks_num):
            # Edges are rebuilt on every iteration; only the last
            # iteration's edges reach the output.
            bin_edges = []
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)
            results = {
                quantity: buffers[j].get().real
                for j, quantity in enumerate(sorted(quantities))
            }

            for binparam, bin_accum in zip(binning, bin_accums):
                qnames, binnum, ranges = binparam
                # Stack the named quantities as columns of one sample matrix.
                sample_lines = [results[quantity] for quantity in qnames]
                sample = numpy.concatenate(
                    [arr.reshape(subchunk_size, 1) for arr in sample_lines],
                    axis=1)

                hist, edges = numpy.histogramdd(sample, binnum, ranges)
                bin_accum += hist
                bin_edges.append(numpy.array(edges))

        results = [[acc.tolist(), edges.tolist()]
                   for acc, edges in zip(bin_accums, bin_edges)]

        out_queue.put(results)

    # NOTE(review): detach only runs on the success path; an exception above
    # skips it -- confirm whether cleanup on failure is needed.
    #ctx.pop()
    ctx.detach()
def _make_array(self, shape, dtype):
    """Return a new GPUArray constructed from ``shape`` and ``dtype``."""
    return GPUArray(shape, dtype)