def as_strided(x, shape=None, strides=None):
    """Make an ndarray from the given array with the given shape and strides.

    :arg x: the array to reinterpret.
    :arg shape: the requested shape, or *None* to keep ``x.shape``.
    :arg strides: the requested strides, or *None* to keep ``x.strides``.
    :returns: *x* itself if neither shape nor strides would change.
        Otherwise, for builtin dtypes, an array built from a copy of
        ``x.__array_interface__`` with the shape/strides entries replaced;
        for other dtypes, a contiguous reshape of *x*.
    :raises NotImplementedError: if ``x.dtype`` is not a builtin dtype and
        the request is not a C- or Fortran-contiguous reshape of a
        contiguous *x* (this includes passing *shape* without *strides*).
    """
    # work around Numpy bug 1873 (reported by Irwin Zaid)
    # Since this is stolen from numpy, this implementation has the same bug.
    # http://projects.scipy.org/numpy/ticket/1873
    # == https://github.com/numpy/numpy/issues/2466

    # Do not recreate the array if nothing needs to be changed.
    # This fixes a lot of errors on pypy since the DummyArray hack does not
    # currently (2014/May/17) work on pypy.
    same_shape = shape is None or x.shape == shape
    same_strides = strides is None or x.strides == strides
    if same_shape and same_strides:
        return x

    if not x.dtype.isbuiltin:
        if shape is None:
            shape = x.shape

        # Only normalize strides that were actually given: tuple(None)
        # would raise a TypeError instead of the intended
        # NotImplementedError below.
        if strides is not None:
            strides = tuple(strides)

            from pytools import product
            if product(shape) == product(x.shape) and x.flags.forc:
                # Workaround: If we're being asked to do what amounts to a
                # contiguous reshape, at least do that.
                itemsize = x.dtype.itemsize
                if strides == f_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="F")
                    assert result.strides == strides
                    return result
                elif strides == c_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="C")
                    assert result.strides == strides
                    return result

        raise NotImplementedError(
                "as_strided won't work on non-builtin arrays for now. "
                "See https://github.com/numpy/numpy/issues/2466")

    interface = dict(x.__array_interface__)
    if shape is not None:
        interface['shape'] = tuple(shape)
    if strides is not None:
        interface['strides'] = tuple(strides)
    return np.asarray(_DummyArray(interface, base=x))
def grad_monomial(order, rst):
    """Evaluate the derivative of the monomial of order *order* at the points
    *rst*.

    :arg order: A tuple *(i, j,...)* representing the order of the polynomial.
    :arg rst: ``rst[0], rst[1]`` are arrays of :math:`(r,s,...)` coordinates.
        (See :ref:`tri-coords`)
    :return: a tuple of vectors *(dphi_dr, dphi_ds, dphi_dt)*, each of the
        same length as the *rst* arrays.

    .. versionadded:: 2016.1
    """
    ndim = len(order)
    assert ndim == rst.shape[0]

    def power_derivative(coord, exponent):
        # Derivative of coord**exponent with respect to coord. The low-order
        # cases are spelled out so the result keeps the shape of *coord*
        # without evaluating a power with a non-positive exponent.
        if exponent == 0:
            return 0 * coord
        if exponent == 1:
            return 1 + 0 * coord
        return exponent * coord**(exponent - 1)

    from pytools import product

    def partial_derivative(diff_axis):
        # Product rule for a monomial: differentiate the factor belonging
        # to *diff_axis* and leave every other factor untouched.
        factors = [
            power_derivative(rst[axis], order[axis])
            if axis == diff_axis
            else rst[axis]**order[axis]
            for axis in range(ndim)]
        return product(factors)

    return tuple(partial_derivative(axis) for axis in range(ndim))
def parametrization_derivative(
        actx: ArrayContext, dcoll: DiscretizationCollection,
        dd=None) -> MultiVector:
    r"""Computes the product of forward metric derivatives spanning the
    tangent space with topological dimension *dim*.

    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to
        one. Defaults to the base volume discretization.
    :returns: a :class:`pymbolic.geometric_algebra.MultiVector`
        containing the product of metric derivatives.
    """
    # *dd* is optional, as documented above: None selects the volume.
    if dd is None:
        dd = DD_VOLUME

    dim = dcoll.discr_from_dd(dd).dim
    if dim == 0:
        # No reference axes to differentiate along: return a scalar
        # multivector (built from the signed face ones) living in the
        # ambient Euclidean space.
        from pymbolic.geometric_algebra import get_euclidean_space
        return MultiVector(
                _signed_face_ones(actx, dcoll, dd),
                space=get_euclidean_space(dcoll.ambient_dim))

    from pytools import product
    return product(
            forward_metric_derivative_mv(actx, dcoll, rst_axis, dd)
            for rst_axis in range(dim))
def get_function_declaration(
        self, codegen_state, codegen_result, schedule_index):
    """Return the CUDA declaration for the kernel at *schedule_index*.

    The declaration produced by the base class is wrapped in
    ``CudaGlobal``, then in ``Extern("C", ...)`` if the target asks for it,
    and finally in ``CudaLaunchBounds`` when the local grid size has no
    dependencies on kernel parameters.
    """
    declaration = super(CUDACASTBuilder, self).get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from cgen.cuda import CudaGlobal, CudaLaunchBounds
    declaration = CudaGlobal(declaration)

    if self.target.extern_c:
        from cgen import Extern
        declaration = Extern("C", declaration)

    from loopy.schedule import get_insn_ids_for_block_at
    kernel = codegen_state.kernel
    block_insn_ids = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
    _, local_sizes = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            block_insn_ids)

    from loopy.symbolic import get_dependencies
    if get_dependencies(local_sizes):
        # Sizes can't have parameter dependencies if they are
        # to be used in static thread block size.
        return declaration

    from pytools import product
    threads_per_block = product(local_sizes)
    return CudaLaunchBounds(threads_per_block, declaration)
def __init__(self, center, extent=1, npoints=1000):
    """Set up a regular grid of points in a box around *center*.

    :arg center: coordinates of the box center; its length fixes the
        number of dimensions.
    :arg extent: edge length of the box (the box spans
        ``center - extent/2`` to ``center + extent/2``).
    :arg npoints: either a single number, used along every axis, or a
        sequence (tuple, list, ...) with one point count per axis.
    :raises ValueError: if *npoints* is a sequence whose length differs
        from the number of dimensions.
    """
    from numbers import Number

    center = np.asarray(center)
    self.dimensions, = dim, = center.shape
    self.a = a = center - extent*0.5
    self.b = b = center + extent*0.5

    # Anything that is not a plain number is taken to be a per-axis
    # sequence; checking only for `tuple` would mistake a list of counts
    # for a single scalar count.
    if isinstance(npoints, Number):
        npoints = dim*(npoints,)
    elif len(npoints) != dim:
        raise ValueError("length of npoints must match dimension")

    # With a single point along an axis, start the range at the center so
    # that the one generated point is the center.
    for i in range(dim):
        if npoints[i] == 1:
            a[i] = center[i]

    # A complex step tells np.mgrid to produce that many points with the
    # end point included.
    mgrid_index = tuple(
            slice(a[i], b[i], 1j*npoints[i])
            for i in range(dim))

    # (axis, point x idx, point y idx, ...)
    self.nd_points = np.mgrid[mgrid_index]

    self.points = self.nd_points.reshape(dim, -1).copy()

    from pytools import product
    self.npoints = product(npoints)
def nbytes(self):
    """Return the number of bytes occupied by this object.

    This is the product of the extents in ``self.storage_shape`` (or in
    ``self.shape`` when no storage shape is set), multiplied by
    ``self.dtype.itemsize``.
    """
    from pytools import product

    extents = self.shape
    storage = self.storage_shape
    if storage is not None:
        extents = storage

    element_count = product(extent for extent in extents)
    return element_count*self.dtype.itemsize
def __init__(self, center, extent=1, npoints=1000):
    """Set up a regular grid of points in a box around *center*.

    :arg center: coordinates of the box center; its length fixes the
        number of dimensions.
    :arg extent: edge length of the box.
    :arg npoints: a single number (used along every axis) or a sequence
        with one point count per axis.
    :raises ValueError: if *npoints* is a sequence whose length differs
        from the number of dimensions.
    """
    center = np.asarray(center)
    self.dimensions, = dim, = center.shape

    half_extent = extent * 0.5
    self.a = lower = center - half_extent
    self.b = upper = center + half_extent

    from numbers import Number
    if isinstance(npoints, Number):
        npoints = (npoints, ) * dim
    elif len(npoints) != dim:
        raise ValueError("length of npoints must match dimension")

    # A one-point axis starts at the center, so its only point is the
    # center itself.
    for axis in range(dim):
        if npoints[axis] == 1:
            lower[axis] = center[axis]

    # A complex step asks np.mgrid for that many points, end inclusive.
    grid_slices = tuple(
        slice(lower[axis], upper[axis], 1j * npoints[axis])
        for axis in range(dim))

    # (axis, point x idx, point y idx, ...)
    self.nd_points = np.mgrid[grid_slices]
    self.points = self.nd_points.reshape(dim, -1).copy()

    from pytools import product
    self.npoints = product(npoints)
def grad_monomial(order, rst):
    """Evaluate the derivative of the monomial of order *order* at the points
    *rst*.

    :arg order: A tuple *(i, j,...)* representing the order of the polynomial.
    :arg rst: ``rst[0], rst[1]`` are arrays of :math:`(r,s,...)` coordinates.
        (See :ref:`tri-coords`)
    :return: a tuple of vectors *(dphi_dr, dphi_ds, dphi_dt)*, each of the
        same length as the *rst* arrays.

    .. versionadded:: 2016.1
    """
    dim = len(order)
    assert dim == rst.shape[0]

    from pytools import product

    def d_power(values, power):
        # d/dvalues of values**power; the 0*values terms keep the result
        # array-shaped for the constant cases.
        if power == 0:
            derivative = 0*values
        elif power == 1:
            derivative = 1+0*values
        else:
            derivative = power * values**(power-1)
        return derivative

    gradient = []
    for j in range(dim):
        # Differentiate only the j-th factor of the monomial.
        gradient.append(product(
            d_power(rst[i], order[i]) if i == j else rst[i] ** order[i]
            for i in range(dim)))

    return tuple(gradient)
def __init__(self, shape, dtype=numpy.float32, stream=None,
        allocator=drv.mem_alloc, cuda_device=0):
    """Create a GPU array of the given *shape* and *dtype*.

    :arg shape: tuple of array extents.
    :arg dtype: anything accepted by :class:`numpy.dtype`.
    :arg stream: stored as ``self.stream``.
    :arg allocator: callable taking a byte count and returning device
        memory; only called when the array has a non-zero size.
    :arg cuda_device: index of the CUDA device on which a context is
        created.
    """
    try:
        drv.init()
        # Create the context on the device the caller asked for; this
        # used to be hard-coded to device 0 regardless of *cuda_device*.
        # NOTE(review): the context is kept only in this local and is never
        # explicitly released here -- confirm that is intended.
        ctx = drv.Device(cuda_device).make_context()
    except RuntimeError:
        # Device is already initialized, so this is ignored.
        # Ugly, but works for now.
        pass

    # which device are we working on
    self.cuda_device = cuda_device

    # internal shape
    self.shape = shape

    # internal type
    self.dtype = numpy.dtype(dtype)

    from pytools import product
    # internal size (number of elements)
    self.size = product(shape)

    self.allocator = allocator
    if self.size:
        self.gpudata = self.allocator(self.size * self.dtype.itemsize)
    else:
        # Nothing to allocate for an empty array.
        self.gpudata = None

    self.stream = stream

    self._update_kernel_kwargs()
def nbytes(self):
    """Return the size in bytes: the product of the storage shape's extents
    (falling back to ``self.shape`` when ``self.storage_shape`` is *None*)
    times ``self.dtype.itemsize``.
    """
    from pytools import product

    default_shape = self.shape
    if self.storage_shape is None:
        effective_shape = default_shape
    else:
        effective_shape = self.storage_shape

    return product(axis_len for axis_len in effective_shape) \
            * self.dtype.itemsize
def as_strided(x, shape=None, strides=None):
    """Make an ndarray from the given array with the given shape and strides.

    :arg x: the array to reinterpret.
    :arg shape: the requested shape, or *None* to keep ``x.shape``.
    :arg strides: the requested strides, or *None* to keep ``x.strides``.
    :returns: for builtin dtypes, an array built from a copy of
        ``x.__array_interface__`` with the shape/strides entries replaced.
        For other dtypes, *x* itself if nothing would change, or a
        contiguous reshape of *x*.
    :raises NotImplementedError: if ``x.dtype`` is not a builtin dtype and
        the request is not a C- or Fortran-contiguous reshape of a
        contiguous *x* (this includes passing *shape* without *strides*).
    """
    # work around Numpy bug 1873 (reported by Irwin Zaid)
    # Since this is stolen from numpy, this implementation has the same bug.
    # http://projects.scipy.org/numpy/ticket/1873
    # == https://github.com/numpy/numpy/issues/2466
    if not x.dtype.isbuiltin:
        same_shape = shape is None or x.shape == shape
        same_strides = strides is None or x.strides == strides
        if same_shape and same_strides:
            return x

        if shape is None:
            shape = x.shape

        # Only normalize strides that were actually given: tuple(None)
        # would raise a TypeError instead of the intended
        # NotImplementedError below.
        if strides is not None:
            strides = tuple(strides)

            from pytools import product
            if product(shape) == product(x.shape) and x.flags.forc:
                # Workaround: If we're being asked to do what amounts to a
                # contiguous reshape, at least do that.
                itemsize = x.dtype.itemsize
                if strides == f_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="F")
                    assert result.strides == strides
                    return result
                elif strides == c_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="C")
                    assert result.strides == strides
                    return result

        raise NotImplementedError(
                "as_strided won't work on non-builtin arrays for now. "
                "See https://github.com/numpy/numpy/issues/2466")

    interface = dict(x.__array_interface__)
    if shape is not None:
        interface['shape'] = tuple(shape)
    if strides is not None:
        interface['strides'] = tuple(strides)
    return np.asarray(_DummyArray(interface, base=x))
def parametrization_derivative(ambient_dim, dim, where=None):
    """Return a :class:`pymbolic.geometric_algebra.MultiVector` representing
    the derivative of the reference-to-global parametrization.

    The result is the product of one :class:`MultiVector` per entry obtained
    by iterating over the transpose of
    ``parametrization_derivative_matrix(ambient_dim, dim, where)``.
    """
    derivative_matrix = parametrization_derivative_matrix(
            ambient_dim, dim, where)

    from pytools import product
    tangent_vectors = (
            MultiVector(column) for column in derivative_matrix.T)
    return product(tangent_vectors)
def __init__(self, shape, dtype, stream=None):
    """Create a GPU array with the given *shape* and *dtype*.

    Device memory is obtained from ``drv.mem_alloc`` unless the array has
    zero elements, in which case ``self.gpudata`` is *None*.
    """
    self.shape = shape
    self.dtype = numpy.dtype(dtype)

    from pytools import product
    element_count = product(shape)
    self.size = element_count

    # Only allocate when there is something to store.
    self.gpudata = (
            drv.mem_alloc(element_count * self.dtype.itemsize)
            if element_count
            else None)

    self.stream = stream
def as_strided(x, shape=None, strides=None):
    """Make an ndarray from the given array with the given shape and strides.

    :arg x: the array to reinterpret.
    :arg shape: the requested shape, or *None* to keep ``x.shape``.
    :arg strides: the requested strides, or *None* to keep ``x.strides``.
    :returns: for builtin dtypes, an array built from a copy of
        ``x.__array_interface__`` with the shape/strides entries replaced.
        For other dtypes, *x* itself if nothing would change, or a
        contiguous reshape of *x*.
    :raises NotImplementedError: if ``x.dtype`` is not a builtin dtype and
        the request is not a C- or Fortran-contiguous reshape of a
        contiguous *x* (this includes passing *shape* without *strides*).
    """
    # work around Numpy bug 1873 (reported by Irwin Zaid)
    # Since this is stolen from numpy, this implementation has the same bug.
    # http://projects.scipy.org/numpy/ticket/1873
    if not x.dtype.isbuiltin:
        same_shape = shape is None or x.shape == shape
        same_strides = strides is None or x.strides == strides
        if same_shape and same_strides:
            return x

        if shape is None:
            shape = x.shape

        # Only normalize strides that were actually given: tuple(None)
        # would raise a TypeError instead of the intended
        # NotImplementedError below.
        if strides is not None:
            strides = tuple(strides)

            from pytools import product
            if product(shape) == product(x.shape) and x.flags.forc:
                # Workaround: If we're being asked to do what amounts to a
                # contiguous reshape, at least do that.
                itemsize = x.dtype.itemsize
                if strides == f_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="F")
                    assert result.strides == strides
                    return result
                elif strides == c_contiguous_strides(itemsize, shape):
                    result = x.reshape(-1).reshape(*shape, order="C")
                    assert result.strides == strides
                    return result

        raise NotImplementedError(
                "as_strided won't work on non-builtin arrays for now. "
                "See http://projects.scipy.org/numpy/ticket/1873")

    interface = dict(x.__array_interface__)
    if shape is not None:
        interface['shape'] = tuple(shape)
    if strides is not None:
        interface['strides'] = tuple(strides)
    return np.asarray(_DummyArray(interface, base=x))
def check_sizes(kernel, device):
    """Check *kernel*'s launch configuration against the limits of *device*.

    Checks the number of work item dimensions, per-axis and total work
    group size (when the sizes can be evaluated from the ``approximately``
    values of the kernel's value arguments), local memory use and the
    number of constant arguments.

    If *device* is *None*, only an advisory is issued and nothing is checked.

    :raises LoopyError: if any of the limits is exceeded.
    """
    import loopy as lp
    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn as warn_with_kernel
        warn_with_kernel(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    # Approximate values of the kernel parameters, used to turn the
    # symbolic grid sizes into numbers.
    known_values = {
            arg.name: arg.approximately
            for arg in kernel.args
            if isinstance(arg, lp.ValueArg) and arg.approximately is not None}

    global_sizes, local_sizes = kernel.get_grid_size_upper_bounds_as_exprs()

    axis_count = max(len(global_sizes), len(local_sizes))
    if axis_count > device.max_work_item_dimensions:
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        global_sizes = evaluate(global_sizes, known_values)
        local_sizes = evaluate(local_sizes, known_values)
    except UnknownVariableError as name:
        import warnings
        warnings.warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for axis, axis_size in enumerate(local_sizes):
            if axis_size > device.max_work_item_sizes[axis]:
                raise LoopyError("group axis %d too big" % axis)

        from pytools import product
        if product(local_sizes) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    constant_args = [
            arg for arg in kernel.args if isinstance(arg, ConstantArg)]
    if len(constant_args) > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def check_sizes(kernel, device):
    """Check *kernel*'s launch configuration against the limits of *device*.

    Checks the number of work item dimensions, per-axis and total work
    group size (when the sizes can be evaluated from the ``approximately``
    values of the kernel's value arguments), local memory use and the
    number of constant arguments.

    If *device* is *None*, only an advisory is issued and nothing is checked.

    :raises LoopyError: if any of the limits is exceeded.
    """
    import loopy as lp
    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn as warn_with_kernel
        warn_with_kernel(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    # Approximate values of the kernel parameters, used to turn the
    # symbolic grid sizes into numbers.
    approx_values = {}
    value_args = (arg for arg in kernel.args if isinstance(arg, lp.ValueArg))
    for value_arg in value_args:
        if value_arg.approximately is not None:
            approx_values[value_arg.name] = value_arg.approximately

    group_counts, group_sizes = kernel.get_grid_sizes_as_exprs()

    used_dimensions = max(len(group_counts), len(group_sizes))
    if used_dimensions > device.max_work_item_dimensions:
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        group_counts = evaluate(group_counts, approx_values)
        group_sizes = evaluate(group_sizes, approx_values)
    except UnknownVariableError as name:
        import warnings
        warnings.warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        for axis, size in enumerate(group_sizes):
            if size > device.max_work_item_sizes[axis]:
                raise LoopyError("group axis %d too big" % axis)

        from pytools import product
        if product(group_sizes) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    n_constant_args = sum(
            1 for arg in kernel.args if isinstance(arg, ConstantArg))
    if n_constant_args > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def parametrization_derivative(ambient_dim, dim=None, dd=None):
    """Return the product of the forward metric derivatives along each of
    the *dim* reference axes.

    :arg dim: number of reference axes; defaults to *ambient_dim*.
    :arg dd: passed through to the metric derivative helpers.
    :returns: for ``dim == 0``, a :class:`MultiVector` wrapping a
        one-entry array holding ``_SignedFaceOnes(dd)``; otherwise the
        product of ``forward_metric_derivative_mv`` over all reference
        axes.
    """
    n_ref_axes = ambient_dim if dim is None else dim

    if n_ref_axes == 0:
        return MultiVector(np.array([_SignedFaceOnes(dd)]))

    from pytools import product
    axis_derivatives = (
        forward_metric_derivative_mv(ambient_dim, axis, dd)
        for axis in range(n_ref_axes))
    return product(axis_derivatives)
def monomial(order, rst):
    """Evaluate the monomial of order *order* at the points *rst*.

    :arg order: A tuple *(i, j,...)* representing the order of the polynomial.
    :arg rst: ``rst[0], rst[1]`` are arrays of :math:`(r,s,...)` coordinates.
        (See :ref:`tri-coords`)
    """
    ndim = len(order)
    assert ndim == rst.shape[0]

    from pytools import product

    # One power per coordinate axis; the monomial is their product.
    axis_powers = [rst[axis] ** order[axis] for axis in range(ndim)]
    return product(axis_powers)
def monomial(order, rst):
    """Evaluate the monomial of order *order* at the points *rst*.

    :arg order: A tuple *(i, j,...)* representing the order of the polynomial.
    :arg rst: ``rst[0], rst[1]`` are arrays of :math:`(r,s,...)` coordinates.
        (See :ref:`tri-coords`)
    """
    from pytools import product

    n_axes = len(order)
    assert n_axes == rst.shape[0]

    def axis_power(iaxis):
        # Contribution of a single coordinate axis to the monomial.
        return rst[iaxis]**order[iaxis]

    return product(axis_power(iaxis) for iaxis in range(n_axes))
def parametrization_derivative(ambient_dim, dim=None, dd=None):
    """Return the product of the forward metric derivatives along each of
    the *dim* reference axes.

    :arg dim: number of reference axes; defaults to *ambient_dim*.
    :arg dd: passed through to the metric derivative helpers.
    :returns: for ``dim == 0``, a scalar :class:`MultiVector` holding
        ``_SignedFaceOnes(dd)`` in the Euclidean space of dimension
        *ambient_dim*; otherwise the product of
        ``forward_metric_derivative_mv`` over all reference axes.
    """
    n_ref_axes = dim
    if n_ref_axes is None:
        n_ref_axes = ambient_dim

    if n_ref_axes != 0:
        from pytools import product
        return product(
            forward_metric_derivative_mv(ambient_dim, axis, dd)
            for axis in range(n_ref_axes))

    from pymbolic.geometric_algebra import get_euclidean_space
    ambient_space = get_euclidean_space(ambient_dim)
    return MultiVector(_SignedFaceOnes(dd), space=ambient_space)
def diagonal(self, *, get_data=True):
    """Contract ``self._data`` with the computational basis vectors of every
    basis and return the real part as a flat array.

    The result has ``pytools.product(self.dim_hilbert)`` entries.

    :arg get_data: accepted for interface compatibility; not used by this
        implementation.
    """
    num_qubits = self.n_qubits

    # einsum operands in "interleaved" form: each tensor is followed by its
    # index list. Qubit q's basis tensor maps input index q to output
    # index q + num_qubits.
    basis_operands = []
    for qubit, basis in enumerate(self.bases):
        basis_operands.extend(
            (basis.computational_basis_vectors, [qubit + num_qubits, qubit]))

    data_indices = list(range(num_qubits))
    result_indices = list(range(num_qubits, 2 * num_qubits))
    flat_length = pytools.product(self.dim_hilbert)

    contracted = np.einsum(
        self._data, data_indices, *basis_operands, result_indices,
        optimize=True)
    return contracted.real.reshape(flat_length)
def __call__(self, op_class, field):
    """Run the differentiation kernel for *op_class* on *field*.

    :arg op_class: passed to ``self.get_kernel`` and ``self.gpu_diffmats``
        to select the kernel and the differentiation matrices.
    :arg field: GPU array of dtype ``self.plan.given.float_type``; it is
        bound to the kernel's texture reference.
    :returns: a list with one freshly allocated volume vector per
        dimension, which the kernel is given as output arguments.
    """
    discr = self.discr
    given = self.plan.given

    d = discr.dimensions
    # The unpacking doubles as a check that there is exactly one group.
    elgroup, = discr.element_groups

    func, field_texref = self.get_kernel(op_class, elgroup)

    assert field.dtype == given.float_type

    field.bind_to_texref_ext(field_texref, allow_double_hack=True)

    rst_diff = [discr.volume_empty() for axis in range(d)]
    rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff]

    gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

    if discr.instrumented:
        discr.diff_op_timer.add_timer_callable(func.prepared_timed_call(
            self.grid, gpu_diffmats.device_memory, *rst_diff_gpudata))

        # Account for the global memory traffic of this launch.
        from pytools import product
        discr.gmem_bytes_diff.add(
                given.float_size()
                * (
                    # matrix fetch
                    gpu_diffmats.block_floats * product(self.grid)
                    # field fetch
                    + given.dofs_per_el()
                    * given.dofs_per_el() * given.microblock.elements
                    * self.grid[1] * self.plan.parallelism.total()
                    # field store
                    + len(discr.nodes)
                    ))
    else:
        func.prepared_call(self.grid, gpu_diffmats.device_memory,
                *rst_diff_gpudata)

    # A permanently disabled ("if False:") debug dump used to live here. It
    # referenced a debug buffer that this method never creates and used
    # Python 2-only print statements, so it has been removed.

    return rst_diff
def get_rho_distrib(self):
    """Return a function ``f(x, el)`` giving a uniform density on the box
    between ``self.lower`` and ``self.upper``.

    The returned function evaluates to ``1/volume`` inside the box
    (boundaries included) and to 0 outside, where the volume is the
    product of ``h-l`` over the ``(l, h)`` pairs in ``self.zipped``.
    """
    from pytools import product
    normalization = 1/product(h-l for l, h in self.zipped)

    lower = numpy.array(self.lower)
    upper = numpy.array(self.upper)

    def f(x, el):
        # *el* is part of the expected call signature but is not needed.
        if (x < lower).any() or (upper < x).any():
            return 0

        return normalization

    return f
def wrap_function_declaration(self, kernel, fdecl):
    """Wrap *fdecl* for use as a CUDA kernel entry point.

    The declaration is wrapped in ``CudaGlobal``, then in
    ``Extern("C", ...)`` if ``self.extern_c`` is set, and finally in
    ``CudaLaunchBounds`` with the product of *kernel*'s local grid sizes.
    """
    from cgen.cuda import CudaGlobal, CudaLaunchBounds

    wrapped = CudaGlobal(fdecl)
    if self.extern_c:
        from cgen import Extern
        wrapped = Extern("C", wrapped)

    grid_sizes = kernel.get_grid_sizes_as_exprs()
    local_sizes = grid_sizes[1]

    from pytools import product
    return CudaLaunchBounds(product(local_sizes), wrapped)
def __call__(self, op_class, field):
    """Run the differentiation kernel for *op_class* on *field*.

    :arg op_class: passed to ``self.get_kernel`` and ``self.gpu_diffmats``
        to select the kernel and the differentiation matrices.
    :arg field: GPU array of dtype ``self.plan.given.float_type``; it is
        bound to the kernel's texture reference.
    :returns: a list with one freshly allocated volume vector per
        dimension, which the kernel is given as output arguments.
    """
    discr = self.discr
    given = self.plan.given

    d = discr.dimensions
    # The unpacking doubles as a check that there is exactly one group.
    elgroup, = discr.element_groups

    func, field_texref = self.get_kernel(op_class, elgroup)

    assert field.dtype == given.float_type

    field.bind_to_texref_ext(field_texref, allow_double_hack=True)

    rst_diff = [discr.volume_empty() for axis in range(d)]
    rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff]

    gpu_diffmats = self.gpu_diffmats(op_class, elgroup)

    if discr.instrumented:
        discr.diff_op_timer.add_timer_callable(
            func.prepared_timed_call(self.grid, gpu_diffmats.device_memory,
                                     *rst_diff_gpudata))

        # Account for the global memory traffic of this launch.
        from pytools import product
        discr.gmem_bytes_diff.add(given.float_size() * (
            # matrix fetch
            gpu_diffmats.block_floats * product(self.grid)
            # field fetch
            + given.dofs_per_el() * given.dofs_per_el() *
            given.microblock.elements * self.grid[1] *
            self.plan.parallelism.total()
            # field store
            + len(discr.nodes)))
    else:
        func.prepared_call(self.grid, gpu_diffmats.device_memory,
                           *rst_diff_gpudata)

    # A permanently disabled ("if False:") debug dump used to live here. It
    # referenced a debug buffer that this method never creates and used
    # Python 2-only print statements, so it has been removed.

    return rst_diff
def diagonal(self, *, get_data=True):
    """Contract ``self._data`` with the computational basis vectors of every
    basis and return the real part as a flat array.

    The result has ``pytools.product(self.dim_hilbert)`` entries.

    :arg get_data: accepted for interface compatibility; not used by this
        implementation.
    """
    n_qubits = self.n_qubits
    basis_tensors = [
        basis.computational_basis_vectors for basis in self.bases
    ]

    # Interleaved einsum operands: tensor, index list, tensor, index list...
    # The tensor of qubit i carries output index i + n_qubits and input
    # index i.
    index_pairs = [[i + n_qubits, i] for i in range(len(basis_tensors))]
    einsum_operands = [
        operand
        for tensor_and_indices in zip(basis_tensors, index_pairs)
        for operand in tensor_and_indices
    ]

    in_indices = list(range(n_qubits))
    out_indices = list(range(n_qubits, 2 * n_qubits))
    n_diagonal_entries = pytools.product(self.dim_hilbert)

    traced = np.einsum(self._data, in_indices, *einsum_operands,
                       out_indices, optimize=True)
    return traced.real.reshape(n_diagonal_entries)
def map_product(self, expr):
    """Map a product that has at most one non-constant factor.

    The constant factors are multiplied together and the result is
    multiplied by ``self.rec`` applied to the non-constant factor (or to
    the constant 1 if there is none).

    :raises RuntimeError: if the product has more than one non-constant
        factor.
    """
    from pymbolic.primitives import is_constant

    constant_factors = []
    varying_factors = []
    for factor in expr.children:
        bucket = constant_factors if is_constant(factor) else varying_factors
        bucket.append(factor)

    if len(varying_factors) > 1:
        raise RuntimeError("DerivativeTaker doesn't support products with "
                "more than one non-constant")

    # With no varying factor, recurse on the constant 1 instead.
    operand = varying_factors[0] if varying_factors else 1

    from pytools import product
    return product(constant_factors) * self.rec(operand)
def _ensure_gpu_array_shape(self, arr, shape):
    """Return a float64 GPU array of the given *shape*, reusing the device
    allocation of *arr* when it is large enough.

    :arg arr: GPU array whose ``gpudata`` carries a ``size`` attribute
        giving the capacity of its allocation in bytes.
    :arg shape: shape of the array to return.
    :returns: a newly allocated array if *arr*'s allocation was too small
        (in which case that allocation is freed), otherwise a new array
        object over *arr*'s existing allocation.
    :raises RuntimeError: if freeing or reallocating fails.
    """
    new_size = pytools.product(shape)
    # 8 bytes per element: the arrays created here are float64.
    new_size_bytes = new_size * 8
    if arr.gpudata.size < new_size_bytes:
        # reallocate
        try:
            arr.gpudata.free()
            out = ga.empty(shape, np.float64)
            # Record the capacity of the allocation that was just made.
            # This used to read the byte count of self._work_data, which
            # is unrelated to the new array.
            out.gpudata.size = out.nbytes
        except Exception as ex:
            raise RuntimeError(
                f"Could not allocate a GPU array of shape {shape} "
                f"and size {new_size_bytes} bytes") from ex
    else:
        # reallocation not required,
        # reshape but reuse the allocation of the array we were given
        # (previously this always took self._work_data's allocation,
        # regardless of *arr*)
        out = ga.GPUArray(
            shape=shape,
            dtype=np.float64,
            gpudata=arr.gpudata,
        )
    return out
def get_rho_distrib(self):
    """Return a function ``f(x, el)`` combining this distribution's density
    in its own axes with the density of ``self.next`` in the remaining axes.

    Only one or two own axes (``len(self.radii)``) are supported.

    :raises ValueError: if ``len(self.radii)`` is neither 1 nor 2.
    """
    z_func = self.next.get_rho_distrib()
    z_count = self.next.count_axes()[0]

    # Decide which part of the coordinate vector belongs to this
    # distribution and which part is handed on to self.next.
    if self.axis_first:
        z_slice = slice(0, z_count)
        my_slice = slice(z_count, None)
    else:
        z_slice = slice(len(self.radii), None)
        my_slice = slice(0, len(self.center))

    n = len(self.radii)
    from math import pi
    from pyrticle._internal import gamma
    from pytools import product
    # NOTE(review): under Python 2 without true division, n/2 truncates
    # for odd n -- confirm that this module enables true division.
    distr_vol = 2 * pi**(n/2) \
            / (gamma(n/2)*n) \
            * product(self.radii)

    if n == 2:
        normalization = 1/distr_vol

        def f(x, el):
            if la.norm((x[my_slice]-self.center)/self.radii) <= 1:
                return normalization*z_func(x[z_slice], el)
            else:
                return 0
    elif n == 1:
        normalization = 2/(pi*distr_vol)

        def f(x, el):
            normx = la.norm((x[my_slice]-self.center)/self.radii)
            if normx <= 1:
                return normalization\
                        *z_func(x[z_slice], el)\
                        *(1-normx**2)**-0.5
            else:
                return 0
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("invalid dimension for KV")

    return f
def generate_linearized_array(array, value):
    """Lay out the entries of *value* in a flat array according to the
    strides of *array*.

    :arg array: array descriptor providing ``shape``, ``dtype``,
        ``dim_tags``, ``offset`` and ``name``.
    :arg value: the data to linearize; indexed with the index tuples of
        ``value.shape``.
    :returns: a flat numpy array with ``product(array.shape)`` entries in
        which ``value[idx]`` is stored at position ``sum(idx * strides)``.
    :raises LoopyError: if the shape or a stride is not an :class:`int`, or
        if a dim tag is not a ``FixedStrideArrayDimTag``.
    """
    from pytools import product
    entry_count = product(axis_len for axis_len in array.shape)

    if not isinstance(entry_count, int):
        raise LoopyError("cannot produce literal for array '%s': "
                "shape is not a compile-time constant"
                % array.name)

    axis_strides = []
    flat_data = np.zeros(entry_count, array.dtype.numpy_dtype)

    from loopy.kernel.array import FixedStrideArrayDimTag
    for iaxis, tag in enumerate(array.dim_tags):
        if not isinstance(tag, FixedStrideArrayDimTag):
            raise LoopyError("cannot produce literal for array '%s': "
                    "dim_tag type '%s' not supported"
                    % (array.name, type(tag).__name__))

        if not isinstance(tag.stride, int):
            raise LoopyError("cannot produce literal for array '%s': "
                    "stride along axis %d (1-based) is not a "
                    "compile-time constant"
                    % (array.name, iaxis+1))

        axis_strides.append(tag.stride)

    assert array.offset == 0

    from pytools import indices_in_shape
    for index in indices_in_shape(value.shape):
        # Flat position of this entry: dot product of index and strides.
        position = sum(
                component * stride
                for component, stride in zip(index, axis_strides))
        flat_data[position] = value[index]

    return flat_data
def map_parametrization_derivative(self, expr):
    """Expand a parametrization derivative into a product of multivectors.

    Looks up the discretization for ``expr.where`` (using the fine density
    discretization of a layer potential source), builds an object array
    whose ``(i, j)`` entry is the reference derivative along axis *j* of
    node coordinate component *i*, and returns the product of one
    :class:`MultiVector` per column of that array.

    :raises RuntimeError: if ``expr.where`` does not refer to a
        discretization.
    """
    discr = self.discr_dict[expr.where]

    from pytential.qbx import LayerPotentialSource
    if isinstance(discr, LayerPotentialSource):
        discr = discr.fine_density_discr

    from meshmode.discretization import Discretization
    if not isinstance(discr, Discretization):
        raise RuntimeError("Cannot compute the parametrization derivative "
                "of something that is not a discretization (a target perhaps?). "
                "For example, you will receive this error if you try to "
                "evaluate S' in the volume.")

    # Use the builtin `object` as dtype: the `np.object` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    par_grad = np.zeros((discr.ambient_dim, discr.dim), object)
    for i in range(discr.ambient_dim):
        for j in range(discr.dim):
            par_grad[i, j] = prim.NumReferenceDerivative(
                    frozenset([j]),
                    prim.NodeCoordinateComponent(i, expr.where),
                    expr.where)

    from pytools import product
    return product(MultiVector(vec) for vec in par_grad.T)
def generate_box_mesh(axis_coords, order=1, coord_dtype=np.float64):
    """Create a semi-structured mesh.

    :param axis_coords: a tuple with a number of entries corresponding
        to the number of dimensions, with each entry a numpy array
        specifying the coordinates to be used along that axis. The axes
        may have different numbers of points.
    :param order: passed on to ``make_group_from_vertices``.
    :param coord_dtype: dtype of the vertex coordinate array.
    :raises ValueError: if an axis has fewer than two points.
    :raises NotImplementedError: for more than three dimensions.
    """

    for iaxis, axc in enumerate(axis_coords):
        if len(axc) < 2:
            raise ValueError("need at least two points along axis %d"
                    % (iaxis+1))

    dim = len(axis_coords)

    shape = tuple(len(axc) for axc in axis_coords)

    from pytools import product
    nvertices = product(shape)

    # Vertices are numbered in Fortran order: the first axis varies fastest.
    vertex_indices = np.arange(nvertices).reshape(*shape, order="F")

    # The coordinate array is flattened in C order below. Storing it with
    # the axes reversed makes flat position vertex_indices[i, j, ...] hold
    # the coordinates (x_i, y_j, ...). With the un-reversed shape, the
    # broadcast below only worked if all axes had the same length.
    vertices = np.empty((dim,)+shape[::-1], dtype=coord_dtype)
    for idim in range(dim):
        # Axis idim is followed by idim axes in the reversed layout, so it
        # needs that many trailing length-1 axes to broadcast correctly.
        vshape = (shape[idim],) + (1,)*idim
        vertices[idim] = axis_coords[idim].reshape(*vshape)

    vertices = vertices.reshape(dim, -1)

    el_vertices = []

    if dim == 1:
        for i in range(shape[0]-1):
            # a--b

            a = vertex_indices[i]
            b = vertex_indices[i+1]

            el_vertices.append((a, b,))

    elif dim == 2:
        for i in range(shape[0]-1):
            for j in range(shape[1]-1):

                # c--d
                # |  |
                # a--b

                a = vertex_indices[i, j]
                b = vertex_indices[i+1, j]
                c = vertex_indices[i, j+1]
                d = vertex_indices[i+1, j+1]

                # Two triangles per cell.
                el_vertices.append((a, b, c))
                el_vertices.append((d, c, b))

    elif dim == 3:
        for i in range(shape[0]-1):
            for j in range(shape[1]-1):
                for k in range(shape[2]-1):

                    a000 = vertex_indices[i, j, k]
                    a001 = vertex_indices[i, j, k+1]
                    a010 = vertex_indices[i, j+1, k]
                    a011 = vertex_indices[i, j+1, k+1]

                    a100 = vertex_indices[i+1, j, k]
                    a101 = vertex_indices[i+1, j, k+1]
                    a110 = vertex_indices[i+1, j+1, k]
                    a111 = vertex_indices[i+1, j+1, k+1]

                    # Six tetrahedra per cell.
                    el_vertices.append((a000, a100, a010, a001))
                    el_vertices.append((a101, a100, a001, a010))
                    el_vertices.append((a101, a011, a010, a001))

                    el_vertices.append((a100, a010, a101, a110))
                    el_vertices.append((a011, a010, a110, a101))
                    el_vertices.append((a011, a111, a101, a110))

    else:
        raise NotImplementedError("box meshes of dimension %d"
                % dim)

    el_vertices = np.array(el_vertices, dtype=np.int32)

    grp = make_group_from_vertices(
            vertices.reshape(dim, -1), el_vertices, order)

    from meshmode.mesh import Mesh
    return Mesh(vertices, [grp],
            nodal_adjacency=None,
            facial_adjacency_groups=None)
def f(x, el):
    """Return the product of every function in ``funcs_and_slices``
    evaluated on its own slice of *x* (and on *el*).
    """
    factors = (func(x[sl], el) for func, sl in funcs_and_slices)
    return product(factors)
def adjust_local_temp_var_storage(kernel, device):
    """Return a copy of *kernel* in which every temporary variable has a
    storage shape assigned.

    Temporaries that are not in local scope get their own shape as storage
    shape. For local temporaries on devices with dedicated local memory,
    the last storage axis is padded by the increment that gives the lowest
    conflict multiplicity reported by
    ``why_not_local_access_conflict_free`` while still fitting into the
    usable local memory. A warning is issued if no conflict-free layout
    is found or if the device's local memory type is unknown.
    """
    import pyopencl as cl
    import pyopencl.characterize as cl_char

    logger.debug("%s: adjust temp var storage" % kernel.name)

    from loopy.kernel.data import temp_var_scope

    adjusted_temp_vars = {}
    usable_lmem = cl_char.usable_local_mem_size(device)

    for tv in six.itervalues(kernel.temporary_variables):
        if tv.scope != temp_var_scope.LOCAL:
            adjusted_temp_vars[tv.name] = tv.copy(storage_shape=tv.shape)
            continue

        # Local memory taken by all other local temporaries.
        nbytes_of_others = [
                other.nbytes
                for other in six.itervalues(kernel.temporary_variables)
                if other.scope == temp_var_scope.LOCAL
                and other.name != tv.name]

        base_shape = tv.storage_shape
        if base_shape is None:
            base_shape = tv.shape
        base_shape = list(base_shape)

        # sizes of all dims except the last one, which we may change
        # below to avoid bank conflicts
        from pytools import product

        if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
            # FIXME: could try to avoid cache associativity disasters
            chosen_shape = base_shape

        elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
            best_mult = cl_char.local_memory_bank_count(device)
            best_increment = None
            best_why_not = None
            chosen_shape = base_shape

            for increment in range(base_shape[-1]//2):
                candidate_shape = base_shape[:-1] + [base_shape[-1] + increment]
                mult, why_not = cl_char.why_not_local_access_conflict_free(
                        device, tv.dtype.itemsize,
                        tv.shape, candidate_shape)

                # will choose smallest increment 'automatically'
                if mult < best_mult:
                    lmem_use = (sum(nbytes_of_others)
                            + tv.dtype.itemsize*product(candidate_shape))
                    if lmem_use < usable_lmem:
                        chosen_shape = candidate_shape
                        best_mult = mult
                        best_why_not = why_not
                        best_increment = increment

            if best_mult != 1:
                import warnings
                from loopy.diagnostic import LoopyAdvisory
                warnings.warn("could not find a conflict-free mem layout "
                        "for local variable '%s' "
                        "(currently: %dx conflict, increment: %s, reason: %s)"
                        % (tv.name, best_mult, best_increment, best_why_not),
                        LoopyAdvisory)
        else:
            import warnings
            warnings.warn("unknown type of local memory")

            chosen_shape = base_shape

        adjusted_temp_vars[tv.name] = tv.copy(storage_shape=chosen_shape)

    return kernel.copy(temporary_variables=adjusted_temp_vars)
def __len__(self):
    """Return the product of the entries of ``self._Dimensions``."""
    dimensions = self._Dimensions
    return pytools.product(dimensions)
def _apply_two_qubit_ptm(self, qubit0, qubit1, ptm):
    """Apply a two-qubit Pauli transfer matrix to `qubit0` and `qubit1`.

    Parameters
    ----------
    qubit0 : int
        Index of the first qubit.
    qubit1 : int
        Index of the second qubit.
    ptm : array-like
        A two-qubit ptm in the basis of `qubit0` and `qubit1`. Must be a
        4D array whose axes are (qubit0 out, qubit1 out, qubit0 in,
        qubit1 in); the input dimensions must match the current
        dimensions of the two qubits.

    Raises
    ------
    ValueError
        If `ptm` is not 4-dimensional.
    """
    # Each index is validated under its own parameter name; the labels
    # used to be swapped.
    self._validate_qubit(qubit0, 'qubit0')
    self._validate_qubit(qubit1, 'qubit1')
    if len(ptm.shape) != 4:
        raise ValueError("`ptm` must be a 4D array, got {}D".format(
            len(ptm.shape)))

    # bit0 must be the more significant bit (bit 0 is msb)
    if qubit0 > qubit1:
        qubit0, qubit1 = qubit1, qubit0
        ptm = np.einsum("abcd -> badc", ptm)

    new_shape = list(self._data.shape)
    dim0_out, dim1_out, dim0_in, dim1_in = ptm.shape
    assert new_shape[qubit1] == dim1_in
    assert new_shape[qubit0] == dim0_in
    new_shape[qubit1] = dim1_out
    new_shape[qubit0] = dim0_out
    new_size = pytools.product(new_shape)
    # 8 bytes per element: the data arrays are float64.
    new_size_bytes = new_size * 8

    if self._work_data.gpudata.size < new_size_bytes:
        # reallocate
        self._work_data.gpudata.free()
        self._work_data = ga.empty(new_shape, np.float64)
        # remember the capacity of the new allocation for the next check
        self._work_data.gpudata.size = self._work_data.nbytes
    else:
        # reallocation not required,
        # reshape but reuse allocation
        self._work_data = ga.GPUArray(
            shape=new_shape,
            dtype=np.float64,
            gpudata=self._work_data.gpudata,
        )

    ptm_gpu = self._cached_gpuarray(ptm)

    rest_shape = new_shape.copy()
    rest_shape[qubit1] = 1
    rest_shape[qubit0] = 1

    # Grow the third block dimension from the smallest remaining qubit
    # dimensions, as long as the block stays within 256 threads.
    dint = 1
    for i in sorted(rest_shape):
        if i * dint > 256 // (dim0_out * dim1_out):
            break
        else:
            dint *= i

    # dim_a_out, dim_b_out, d_internal (arbitrary)
    block = (dim0_out, dim1_out, dint)
    blocksize = dim1_out * dim0_out * dint
    sh_mem_size = dint * dim1_in * dim0_in  # + ptm.size
    # enough blocks to cover all output elements (rounded up), at least one
    grid_size = max(1, (new_size - 1) // blocksize + 1)
    grid = (grid_size, 1, 1)

    dim_z = pytools.product(self._data.shape[qubit1 + 1:])
    dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
    dim_rho = new_size  # self.data.size

    _two_qubit_general_ptm.prepared_call(
        grid, block,
        self._data.gpudata, self._work_data.gpudata, ptm_gpu.gpudata,
        dim0_in, dim1_in, dim_z, dim_y, dim_rho,
        shared_size=8 * sh_mem_size)

    # The result was written to the work buffer: swap the two buffers.
    self._data, self._work_data = self._work_data, self._data
def __call__(self, in_vector, prepped_mat, prepped_scaling, out_vector=None):
    """Run the element-local kernel on *in_vector*.

    :arg in_vector: GPU array bound to the kernel's input texture.
    :arg prepped_mat: passed to the kernel as its matrix argument.
    :arg prepped_scaling: optional GPU array bound to the scaling texture;
        *None* selects the kernel variant without scaling.
    :arg out_vector: optional output vector; a new volume vector is
        allocated when it is *None*.
    :returns: the output vector.
    """
    discr = self.discr
    # The unpacking doubles as a check that there is exactly one group.
    elgroup, = discr.element_groups
    given = self.plan.given

    kernel, in_vector_texref, scaling_texref = \
            self.get_kernel(prepped_scaling is not None)

    if out_vector is None:
        out_vector = discr.volume_empty()

    in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)
    if prepped_scaling is not None:
        prepped_scaling.bind_to_texref_ext(scaling_texref,
                allow_double_hack=True)

    # Debug output is on when both this plan's debug name and
    # "cuda_debugbuf" are among the discretization's debug flags.
    debug_enabled = set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug

    if debug_enabled:
        debugbuf = gpuarray.zeros((1024,), dtype=given.float_type)
    else:
        debugbuf = FakeGPUArray()

    if discr.instrumented:
        discr.el_local_timer.add_timer_callable(
                kernel.prepared_timed_call(
                    self.grid,
                    out_vector.gpudata,
                    prepped_mat,
                    debugbuf.gpudata,
                    len(discr.blocks)*given.microblocks_per_block,
                    ))

        # Account for the global memory traffic of this launch.
        from pytools import product
        discr.gmem_bytes_el_local.add(
                given.float_size()
                * (
                    # matrix fetch
                    self.plan.gpu_matrix_block_floats() * product(self.grid)
                    # field fetch
                    + self.plan.preimage_dofs_per_el
                    * given.dofs_per_el() * given.microblock.elements
                    * self.grid[1] * self.plan.parallelism.total()
                    # field store
                    + len(discr.nodes)
                    ))
    else:
        kernel.prepared_call(
                self.grid,
                out_vector.gpudata,
                prepped_mat,
                debugbuf.gpudata,
                len(discr.blocks)*given.microblocks_per_block,
                )

    if debug_enabled:
        copied_debugbuf = debugbuf.get()[:144*7].reshape((144, 7))
        # Single-argument print calls behave identically on Python 2 and 3.
        print("DEBUG")
        numpy.set_printoptions(linewidth=100)
        numpy.set_printoptions(threshold=3000)
        print(copied_debugbuf)

        # Pause until the user presses enter.
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()

    return out_vector
def generate_box_mesh(axis_coords, order=1, coord_dtype=np.float64,
        group_cls=None, boundary_tag_to_face=None,
        mesh_type=None):
    r"""Create a semi-structured mesh.

    :param axis_coords: a tuple with a number of entries corresponding
        to the number of dimensions, with each entry a numpy array
        specifying the coordinates to be used along that axis.
    :param group_cls: One of :class:`meshmode.mesh.SimplexElementGroup`
        or :class:`meshmode.mesh.TensorProductElementGroup`.
    :param boundary_tag_to_face: an optional dictionary for tagging
        boundaries. The keys correspond to custom boundary tags, with the
        values giving a list of the faces on which they should be applied
        in terms of coordinate directions (``+x``, ``-x``, ``+y``, ``-y``,
        ``+z``, ``-z``, ``+w``, ``-w``).

        For example::

            boundary_tag_to_face={"bdry_1": ["+x", "+y"], "bdry_2": ["-x"]}
    :param mesh_type: In two dimensions with non-tensor-product elements,
        *mesh_type* may be set to ``"X"`` to generate this type
        of mesh::

            _______
            |\   /|
            | \ / |
            |  X  |
            | / \ |
            |/   \|
            ^^^^^^^

        instead of the default::

            _______
            |\    |
            | \   |
            |  \  |
            |   \ |
            |    \|
            ^^^^^^^

        Specifying a value other than *None* for all other mesh
        dimensionalities and element types is an error.

    :raises ValueError: if an axis has fewer than two points, if
        *group_cls* or *mesh_type* is unsupported, or if a face identifier
        in *boundary_tag_to_face* is malformed.
    :raises NotImplementedError: for more than three dimensions.

    .. versionchanged:: 2017.1

        *group_factory* parameter added.

    .. versionchanged:: 2020.1

        *boundary_tag_to_face* parameter added.

    .. versionchanged:: 2020.3

        *group_factory* deprecated and renamed to *group_cls*.
    """
    if boundary_tag_to_face is None:
        boundary_tag_to_face = {}

    for iaxis, axc in enumerate(axis_coords):
        if len(axc) < 2:
            raise ValueError("need at least two points along axis %d"
                    % (iaxis + 1))

    dim = len(axis_coords)

    shape = tuple(len(axc) for axc in axis_coords)

    from pytools import product
    nvertices = product(shape)

    # Vertices are numbered in C order (last axis varies fastest).
    vertex_indices = np.arange(nvertices).reshape(*shape)

    vertices = np.empty((dim,) + shape, dtype=coord_dtype)
    for idim in range(dim):
        # Broadcast the coordinates of axis *idim* along all other axes.
        vshape = (shape[idim],) + (1,) * (dim - 1 - idim)
        vertices[idim] = axis_coords[idim].reshape(*vshape)

    vertices = vertices.reshape(dim, -1)

    from meshmode.mesh import SimplexElementGroup, TensorProductElementGroup
    if group_cls is None:
        group_cls = SimplexElementGroup

    if issubclass(group_cls, SimplexElementGroup):
        is_tp = False
    elif issubclass(group_cls, TensorProductElementGroup):
        is_tp = True
    else:
        raise ValueError(f"unsupported value for 'group_cls': {group_cls}")

    el_vertices = []

    if dim == 1:
        if mesh_type is not None:
            raise ValueError(f"unsupported mesh type: '{mesh_type}'")

        for i in range(shape[0] - 1):
            # a--b

            a = vertex_indices[i]
            b = vertex_indices[i + 1]

            el_vertices.append((a, b,))

    elif dim == 2:
        if mesh_type == "X" and not is_tp:
            shape_m1 = tuple(si - 1 for si in shape)

            # Cell midpoints are appended after the regular vertices and
            # numbered in Fortran order (first axis varies fastest).
            nmidpoints = product(shape_m1)
            midpoint_indices = (
                    nvertices
                    + np.arange(nmidpoints).reshape(*shape_m1, order="F"))

            # Store the coordinates with reversed axes so that the C-order
            # flattening below agrees with the Fortran-order numbering
            # above, also when the axes have different numbers of cells.
            # (With equal cell counts this is identical to the un-reversed
            # layout.)
            midpoints = np.empty((dim,) + shape_m1[::-1], dtype=coord_dtype)
            for idim in range(dim):
                vshape = (shape_m1[idim],) + (1,) * idim
                left_axis_coords = axis_coords[idim][:-1]
                right_axis_coords = axis_coords[idim][1:]
                midpoints[idim] = (
                        0.5 * (left_axis_coords + right_axis_coords)
                        ).reshape(*vshape)

            midpoints = midpoints.reshape(dim, -1)
            vertices = np.concatenate((vertices, midpoints), axis=1)

        elif mesh_type is None:
            pass

        else:
            raise ValueError(f"unsupported mesh type: '{mesh_type}'")

        for i in range(shape[0] - 1):
            for j in range(shape[1] - 1):

                # c--d
                # |  |
                # a--b

                a = vertex_indices[i, j]
                b = vertex_indices[i + 1, j]
                c = vertex_indices[i, j + 1]
                d = vertex_indices[i + 1, j + 1]

                if is_tp:
                    el_vertices.append((a, b, c, d))

                elif mesh_type == "X":
                    # Four triangles sharing the cell midpoint.
                    m = midpoint_indices[i, j]
                    el_vertices.append((a, b, m))
                    el_vertices.append((b, d, m))
                    el_vertices.append((d, c, m))
                    el_vertices.append((c, a, m))

                else:
                    el_vertices.append((a, b, c))
                    el_vertices.append((d, c, b))

    elif dim == 3:
        if mesh_type is not None:
            raise ValueError("unsupported mesh_type")

        for i in range(shape[0] - 1):
            for j in range(shape[1] - 1):
                for k in range(shape[2] - 1):

                    a000 = vertex_indices[i, j, k]
                    a001 = vertex_indices[i, j, k + 1]
                    a010 = vertex_indices[i, j + 1, k]
                    a011 = vertex_indices[i, j + 1, k + 1]

                    a100 = vertex_indices[i + 1, j, k]
                    a101 = vertex_indices[i + 1, j, k + 1]
                    a110 = vertex_indices[i + 1, j + 1, k]
                    a111 = vertex_indices[i + 1, j + 1, k + 1]

                    if is_tp:
                        el_vertices.append(
                                (a000, a100, a010, a110,
                                    a001, a101, a011, a111))

                    else:
                        # Six tetrahedra per box cell.
                        el_vertices.append((a000, a100, a010, a001))
                        el_vertices.append((a101, a100, a001, a010))
                        el_vertices.append((a101, a011, a010, a001))

                        el_vertices.append((a100, a010, a101, a110))
                        el_vertices.append((a011, a010, a110, a101))
                        el_vertices.append((a011, a111, a101, a110))

    else:
        raise NotImplementedError("box meshes of dimension %d" % dim)

    el_vertices = np.array(el_vertices, dtype=np.int32)

    grp = make_group_from_vertices(
            vertices.reshape(dim, -1), el_vertices, order,
            group_cls=group_cls)

    # {{{ compute facial adjacency for mesh if there is tag information

    facial_adjacency_groups = None
    face_vertex_indices_to_tags = {}
    boundary_tags = list(boundary_tag_to_face.keys())
    axes = ["x", "y", "z", "w"]

    if boundary_tags:
        # Maps a (non-midpoint) vertex number back to its grid position.
        vert_index_to_tuple = {
                vertex_indices[itup]: itup
                for itup in np.ndindex(shape)}

    for tag in boundary_tags:
        # Need to map the correct face vertices to the boundary tags
        for face in boundary_tag_to_face[tag]:
            if len(face) != 2:
                raise ValueError("face identifier '%s' does not "
                        "consist of exactly two characters" % face)

            side, axis = face
            try:
                axis = axes.index(axis)
            except ValueError:
                raise ValueError("unrecognized axis in face identifier '%s'"
                        % face)
            if axis >= dim:
                raise ValueError(
                        "axis in face identifier '%s' does not exist in %dD"
                        % (face, dim))

            # Grid index along *axis* that all vertices of a face on the
            # requested side must share.
            if side == "-":
                vert_crit = 0
            elif side == "+":
                vert_crit = shape[axis] - 1
            else:
                raise ValueError(
                        "first character of face identifier '%s' is not "
                        "'+' or '-'" % face)

            for ielem in range(0, grp.nelements):
                for ref_fvi in grp.face_vertex_indices():
                    fvi = grp.vertex_indices[ielem, ref_fvi]
                    try:
                        fvi_tuples = [vert_index_to_tuple[i] for i in fvi]
                    except KeyError:
                        # Happens for interior faces of "X" meshes because
                        # midpoints aren't in vert_index_to_tuple. We don't
                        # care about them.
                        continue

                    if all(fvi_tuple[axis] == vert_crit
                            for fvi_tuple in fvi_tuples):
                        key = frozenset(fvi)
                        face_vertex_indices_to_tags.setdefault(
                                key, []).append(tag)

    if boundary_tags:
        from meshmode.mesh import (
                _compute_facial_adjacency_from_vertices,
                BTAG_ALL, BTAG_REALLY_ALL)
        boundary_tags.extend([BTAG_ALL, BTAG_REALLY_ALL])
        facial_adjacency_groups = _compute_facial_adjacency_from_vertices(
                [grp], boundary_tags, np.int32, np.int8,
                face_vertex_indices_to_tags)
    else:
        facial_adjacency_groups = None

    # }}}

    from meshmode.mesh import Mesh
    return Mesh(vertices, [grp],
            facial_adjacency_groups=facial_adjacency_groups,
            is_conforming=True, boundary_tags=boundary_tags)
def _apply_single_qubit_ptm(self, qubit, ptm):
    # noinspection PyUnresolvedReferences
    """Apply a one-qubit Pauli transfer matrix to qubit `qubit`.

    The result is written into the work buffer, which is then swapped with
    the state buffer, so ``self._data`` refers to the transformed state on
    return.

    Parameters
    ----------
    qubit : int
        Qubit index.
    ptm : array-like
        A PTM in the basis of the qubit. Must be a 2D array with axes
        ``(out, in)``; the input dimension must match the current dimension
        of `qubit` in the state.

    Raises
    ------
    ValueError
        If `ptm` is not two-dimensional.
    """
    new_shape = list(self._data.shape)
    self._validate_qubit(qubit, 'qubit')

    # TODO Refactor to use self._validate_ptm
    if len(ptm.shape) != 2:
        raise ValueError(
            "`ptm` must be a 2D array, got {}D".format(len(ptm.shape)))

    dim_bit_out, dim_bit_in = ptm.shape
    # Check the PTM's input dimension against the state *before* the entry
    # is overwritten with the output dimension.
    assert new_shape[qubit] == dim_bit_in
    new_shape[qubit] = dim_bit_out
    new_size = pytools.product(new_shape)
    # 8 bytes per float64 entry
    new_size_bytes = new_size * 8

    if self._work_data.gpudata.size < new_size_bytes:
        # reallocate
        self._work_data.gpudata.free()
        self._work_data = ga.empty(new_shape, np.float64)
        # Record the allocation size so later calls can compare against it.
        self._work_data.gpudata.size = self._work_data.nbytes
    else:
        # reallocation not required,
        # reshape but reuse allocation
        self._work_data = ga.GPUArray(
            shape=new_shape,
            dtype=np.float64,
            gpudata=self._work_data.gpudata,
        )

    ptm_gpu = self._cached_gpuarray(ptm)

    dint = min(64, self._data.size // dim_bit_in)
    block = (1, dim_bit_out, dint)
    blocksize = dim_bit_out * dint
    grid_size = max(1, (new_size - 1) // blocksize + 1)
    grid = (grid_size, 1, 1)

    dim_z = pytools.product(self._data.shape[qubit + 1:])
    dim_y = pytools.product(self._data.shape[:qubit])
    dim_rho = new_size  # self.data.size

    # The two-qubit kernel is reused, with a first-qubit input dimension
    # of 1.
    _two_qubit_general_ptm.prepared_call(
        grid,
        block,
        self._data.gpudata,
        self._work_data.gpudata,
        ptm_gpu.gpudata,
        1, dim_bit_in,
        dim_z,
        dim_y,
        dim_rho,
        shared_size=8 * (ptm.size + blocksize))

    # The work buffer now holds the result; make it the current state.
    self._data, self._work_data = self._work_data, self._data
def diagonal(self, *, get_data=True, target_array=None, flatten=True):
    """Obtain the diagonal of the density matrix.

    Parameters
    ----------
    target_array : None or pycuda.gpuarray.array
        An already-allocated GPU array to which the data will be copied.
        If `None`, the internal work buffer is used (and reallocated if it
        is too small).
    get_data : boolean
        Whether the data should be copied from the GPU. If `False`, a GPU
        array that shares memory with the target array is returned
        instead.
    flatten : boolean
        Only used if `get_data` is true. If true, the result is returned
        as a 1D array; otherwise it is reshaped to one axis per qubit,
        each with the size of the computational subbasis.

    Raises
    ------
    ValueError
        If `target_array` is given but has too few elements.
    """
    diag_bases = [pb.computational_subbasis() for pb in self.bases]
    diag_shape = [db.dim_pauli for db in diag_bases]
    diag_size = pytools.product(diag_shape)

    if target_array is None:
        # 8 bytes per float64 entry
        if self._work_data.gpudata.size < diag_size * 8:
            self._work_data.gpudata.free()
            self._work_data = ga.empty(diag_shape, np.float64)
            self._work_data.gpudata.size = self._work_data.nbytes
        target_array = self._work_data
    elif target_array.size < diag_size:
        raise ValueError(
            "Size of `target_array` is too small ({}).\n"
            "Should be at least {}."
            .format(target_array.size, diag_size))

    # Per basis, the indices of the elements that belong to the
    # computational basis (entries that are None are skipped).
    idx = []
    for pb in self.bases:
        cb_indices = pb.computational_basis_indices
        idx.append([cb_indices[i]
                    for i in range(pb.dim_hilbert)
                    if cb_indices[i] is not None])

    # idx_j is the concatenation of all index lists, idx_i holds the
    # offset of each qubit's list within idx_j.
    idx_j = np.array(list(pytools.flatten(idx))).astype(np.uint32)
    idx_i = np.cumsum([0] + [len(i) for i in idx][:-1]).astype(np.uint32)

    xshape = np.array(self._data.shape, np.uint32)
    yshape = np.array(diag_shape, np.uint32)

    xshape_gpu = self._cached_gpuarray(xshape)
    yshape_gpu = self._cached_gpuarray(yshape)

    idx_i_gpu = self._cached_gpuarray(idx_i)
    idx_j_gpu = self._cached_gpuarray(idx_j)

    # One thread per diagonal element, 256 threads per block.
    block = (2 ** 8, 1, 1)
    grid = (max(1, (diag_size - 1) // 2 ** 8 + 1), 1, 1)

    if len(yshape) == 0:
        # brain-dead case, but should be handled according to exp.
        target_array.set(self._data.get())
    else:
        _multitake.prepared_call(
            grid,
            block,
            self._data.gpudata,
            target_array.gpudata,
            idx_i_gpu.gpudata, idx_j_gpu.gpudata,
            xshape_gpu.gpudata, yshape_gpu.gpudata,
            np.uint32(len(yshape))
        )

    if not get_data:
        return ga.GPUArray(shape=diag_shape,
                           gpudata=target_array.gpudata,
                           dtype=np.float64)

    # The target array may be larger than needed; keep the leading part.
    host_diag = target_array.get().ravel()[:diag_size]
    if flatten:
        return host_diag
    return host_diag.reshape(diag_shape)
def size(self):
    """Return the square of the product of the entries of
    ``self.dim_hilbert``.
    """
    hilbert_total = pytools.product(self.dim_hilbert)
    return hilbert_total ** 2
def _apply_two_qubit_ptm(self, qubit0, qubit1, ptm):
    """Apply a two-qubit Pauli transfer matrix to `qubit0` and `qubit1`.

    The result is written into the work buffer, which is then swapped with
    the state buffer, so ``self._data`` refers to the transformed state on
    return.

    Parameters
    ----------
    qubit0 : int
        Index of the first qubit.
    qubit1 : int
        Index of the second qubit.
    ptm : array-like
        A two-qubit PTM. Must be a 4D array whose axes are, in order,
        ``(out0, out1, in0, in1)``. The two input dimensions must match the
        current dimensions of `qubit0` and `qubit1` in the state.

    Raises
    ------
    ValueError
        If `ptm` is not four-dimensional.
    """
    # Report each index under the name of the parameter it came from.
    self._validate_qubit(qubit0, 'qubit0')
    self._validate_qubit(qubit1, 'qubit1')
    if len(ptm.shape) != 4:
        raise ValueError(
            "`ptm` must be a 4D array, got {}D".format(len(ptm.shape)))

    # bit0 must be the more significant bit (bit 0 is msb)
    if qubit0 > qubit1:
        qubit0, qubit1 = qubit1, qubit0
        # Swap the roles of the two qubits in the PTM accordingly.
        ptm = np.einsum("abcd -> badc", ptm)

    new_shape = list(self._data.shape)
    dim0_out, dim1_out, dim0_in, dim1_in = ptm.shape
    assert new_shape[qubit1] == dim1_in
    assert new_shape[qubit0] == dim0_in
    new_shape[qubit1] = dim1_out
    new_shape[qubit0] = dim0_out
    new_size = pytools.product(new_shape)
    # 8 bytes per float64 entry
    new_size_bytes = new_size * 8

    if self._work_data.gpudata.size < new_size_bytes:
        # reallocate
        self._work_data.gpudata.free()
        self._work_data = ga.empty(new_shape, np.float64)
        # Record the allocation size so later calls can compare against it.
        self._work_data.gpudata.size = self._work_data.nbytes
    else:
        # reallocation not required,
        # reshape but reuse allocation
        self._work_data = ga.GPUArray(
            shape=new_shape,
            dtype=np.float64,
            gpudata=self._work_data.gpudata,
        )

    ptm_gpu = self._cached_gpuarray(ptm)

    rest_shape = new_shape.copy()
    rest_shape[qubit1] = 1
    rest_shape[qubit0] = 1

    # Grow the third block dimension over the remaining axes (smallest
    # first) for as long as the whole block stays within 256 threads.
    dint = 1
    for i in sorted(rest_shape):
        if i * dint > 256 // (dim0_out * dim1_out):
            break
        else:
            dint *= i

    # dim_a_out, dim_b_out, d_internal (arbitrary)
    block = (dim0_out, dim1_out, dint)
    blocksize = dim1_out * dim0_out * dint
    sh_mem_size = dint * dim1_in * dim0_in  # + ptm.size
    grid_size = max(1, (new_size - 1) // blocksize + 1)
    grid = (grid_size, 1, 1)

    dim_z = pytools.product(self._data.shape[qubit1 + 1:])
    dim_y = pytools.product(self._data.shape[qubit0 + 1:qubit1])
    dim_rho = new_size  # self.data.size

    _two_qubit_general_ptm.prepared_call(
        grid,
        block,
        self._data.gpudata,
        self._work_data.gpudata,
        ptm_gpu.gpudata,
        dim0_in, dim1_in,
        dim_z,
        dim_y,
        dim_rho,
        shared_size=8 * sh_mem_size)

    # The work buffer now holds the result; make it the current state.
    self._data, self._work_data = self._work_data, self._data
def size(self):
    """Return the square of the product of the entries of
    ``self.dim_hilbert``.
    """
    total_hilbert_dim = pytools.product(self.dim_hilbert)
    return total_hilbert_dim ** 2
def nbytes(self):
    """Return the product of the entries of ``self.shape`` multiplied by
    ``self.dtype.itemsize``.
    """
    from pytools import product

    entry_count = product(self.shape)
    return entry_count * self.dtype.itemsize
def generate_box_mesh(axis_coords, order=1, coord_dtype=np.float64,
        group_factory=None):
    """Create a semi-structured mesh.

    :param axis_coords: a tuple with a number of entries corresponding
        to the number of dimensions, with each entry a numpy array
        specifying the coordinates to be used along that axis.
    :param group_factory: One of :class:`meshmode.mesh.SimplexElementGroup`
        or :class:`meshmode.mesh.TensorProductElementGroup`.

    :raises ValueError: if an axis has fewer than two points or
        *group_factory* is unsupported.
    :raises NotImplementedError: for more than three dimensions.

    .. versionchanged:: 2017.1

        *group_factory* parameter added.
    """

    for iaxis, axc in enumerate(axis_coords):
        if len(axc) < 2:
            raise ValueError("need at least two points along axis %d"
                    % (iaxis+1))

    dim = len(axis_coords)

    shape = tuple(len(axc) for axc in axis_coords)

    from pytools import product
    nvertices = product(shape)

    # Vertices are numbered in Fortran order (first axis varies fastest).
    vertex_indices = np.arange(nvertices).reshape(*shape, order="F")

    # Store the coordinates with reversed axes so that the C-order
    # flattening below agrees with the Fortran-order numbering above, also
    # when the axes have different numbers of points. (With equal point
    # counts this is identical to the un-reversed layout.)
    vertices = np.empty((dim,)+shape[::-1], dtype=coord_dtype)
    for idim in range(dim):
        vshape = (shape[idim],) + (1,)*idim
        vertices[idim] = axis_coords[idim].reshape(*vshape)

    vertices = vertices.reshape(dim, -1)

    from meshmode.mesh import SimplexElementGroup, TensorProductElementGroup
    if group_factory is None:
        group_factory = SimplexElementGroup

    if issubclass(group_factory, SimplexElementGroup):
        is_tp = False
    elif issubclass(group_factory, TensorProductElementGroup):
        is_tp = True
    else:
        raise ValueError("unsupported value for 'group_factory': %s"
                % group_factory)

    el_vertices = []

    if dim == 1:
        for i in range(shape[0]-1):
            # a--b

            a = vertex_indices[i]
            b = vertex_indices[i+1]

            el_vertices.append((a, b,))

    elif dim == 2:
        for i in range(shape[0]-1):
            for j in range(shape[1]-1):

                # c--d
                # |  |
                # a--b

                a = vertex_indices[i, j]
                b = vertex_indices[i+1, j]
                c = vertex_indices[i, j+1]
                d = vertex_indices[i+1, j+1]

                if is_tp:
                    el_vertices.append((a, b, c, d))
                else:
                    el_vertices.append((a, b, c))
                    el_vertices.append((d, c, b))

    elif dim == 3:
        for i in range(shape[0]-1):
            for j in range(shape[1]-1):
                for k in range(shape[2]-1):

                    a000 = vertex_indices[i, j, k]
                    a001 = vertex_indices[i, j, k+1]
                    a010 = vertex_indices[i, j+1, k]
                    a011 = vertex_indices[i, j+1, k+1]

                    a100 = vertex_indices[i+1, j, k]
                    a101 = vertex_indices[i+1, j, k+1]
                    a110 = vertex_indices[i+1, j+1, k]
                    a111 = vertex_indices[i+1, j+1, k+1]

                    if is_tp:
                        # NOTE(review): here the last axis varies fastest,
                        # whereas the 2D tensor-product case above lists
                        # the first axis fastest -- confirm this matches
                        # what make_group_from_vertices expects.
                        el_vertices.append(
                                (a000, a001, a010, a011,
                                    a100, a101, a110, a111))

                    else:
                        # Six tetrahedra per box cell.
                        el_vertices.append((a000, a100, a010, a001))
                        el_vertices.append((a101, a100, a001, a010))
                        el_vertices.append((a101, a011, a010, a001))

                        el_vertices.append((a100, a010, a101, a110))
                        el_vertices.append((a011, a010, a110, a101))
                        el_vertices.append((a011, a111, a101, a110))

    else:
        raise NotImplementedError("box meshes of dimension %d" % dim)

    el_vertices = np.array(el_vertices, dtype=np.int32)

    grp = make_group_from_vertices(
            vertices.reshape(dim, -1), el_vertices, order,
            group_factory=group_factory)

    from meshmode.mesh import Mesh
    return Mesh(vertices, [grp], is_conforming=True)
def get_temporary_decls(self, codegen_state, schedule_index):
    """Return the declarations for the temporaries of one sub-kernel.

    The sub-kernel is the one named by the schedule item at
    *schedule_index*. The result lists the base-storage declarations
    first, followed by the declarations of the individual temporaries, and
    is terminated by an empty line if it is non-empty.
    """
    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
    from loopy.kernel.array import VectorArrayDimTag
    from loopy.kernel.data import AddressSpace
    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)
    from pytools import product, single_valued

    knl = codegen_state.kernel

    storage_decls = []
    tv_decls = []

    # {{{ declare temporaries

    # Per base storage name: sizes, address spaces and alignments of all
    # temporaries living in it.
    sizes_by_storage = {}
    scopes_by_storage = {}
    aligns_by_storage = {}

    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    subknl_name = knl.schedule[schedule_index].kernel_name
    temps_in_subknl = (
            temporaries_read_in_subkernel(knl, subknl_name)
            | temporaries_written_in_subkernel(knl, subknl_name))

    tvs_by_name = sorted(
            six.itervalues(knl.temporary_variables),
            key=lambda var: var.name)

    for tv in tvs_by_name:
        decl_info = tv.decl_info(self.target, index_dtype=knl.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global
                # declarations
                if tv.address_space != AddressSpace.GLOBAL and (
                        tv.name in temps_in_subknl):
                    plain_decl = self.get_temporary_decl(
                            codegen_state, schedule_index, tv, idi)
                    decl = self.wrap_temporary_decl(
                            plain_decl, tv.address_space)

                    if tv.initializer is not None:
                        assert tv.read_only
                        literal = generate_array_literal(
                                codegen_state, tv, tv.initializer)
                        decl = Initializer(decl, literal)

                    tv_decls.append(decl)

            continue

        # From here on: the temporary lives inside a shared base storage.
        assert tv.initializer is None

        offset = 0
        sizes_by_storage.setdefault(tv.base_storage, []).append(tv.nbytes)
        scopes_by_storage.setdefault(tv.base_storage, []).append(
                tv.address_space)

        # Vector axes multiply the required alignment.
        align_size = tv.dtype.itemsize
        for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
            if isinstance(dim_tag, VectorArrayDimTag):
                align_size *= axis_len

        aligns_by_storage.setdefault(tv.base_storage, []).append(align_size)

        for idi in decl_info:
            cast_decl = self.wrap_temporary_decl(
                    POD(self, idi.dtype, ""), tv.address_space)
            ptr_decl = self.wrap_temporary_decl(
                    POD(self, idi.dtype, idi.name), tv.address_space)

            if tv._base_storage_access_may_be_aliasing:
                ptrtype = _ConstPointer
            else:
                # The 'restrict' part of this is a complete lie--of course
                # all these temporaries are aliased. But we're promising to
                # not use them to shovel data from one representation to the
                # other. That counts, right?
                ptrtype = _ConstRestrictPointer

            cast_decl = ptrtype(cast_decl)
            ptr_decl = ptrtype(ptr_decl)

            cast_tp, cast_d = cast_decl.get_decl_pair()
            init_expr = "(%s %s) (%s + %s)" % (
                    " ".join(cast_tp), cast_d,
                    tv.base_storage,
                    offset)

            tv_decls.append(Initializer(ptr_decl, init_expr))

            offset += idi.dtype.itemsize * product(si for si in idi.shape)

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(sizes_by_storage)):
        bs_decl = self.wrap_temporary_decl(
                Value("char", bs_name),
                single_valued(scopes_by_storage[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        if all(isinstance(bs, int) for bs in bs_sizes):
            bs_size_max = max(bs_sizes)
        else:
            bs_size_max = p.Max(tuple(bs_sizes))

        bs_decl = ArrayOf(bs_decl, ecm(bs_size_max))
        bs_decl = AlignedAttribute(max(aligns_by_storage[bs_name]), bs_decl)

        storage_decls.append(bs_decl)

    # }}}

    all_decls = storage_decls + tv_decls

    if all_decls:
        all_decls.append(Line())

    return all_decls
def adjust_local_temp_var_storage(kernel, device):
    """Return a copy of *kernel* with a storage shape on every temporary.

    Non-local temporaries get their own shape as storage shape. For local
    temporaries on devices whose local memory type is ``LOCAL``, the last
    storage axis may be enlarged: increments below half its length are
    tried, and the one with the lowest conflict multiplicity that keeps
    total local memory use below the usable size is kept. A
    :class:`loopy.diagnostic.LoopyAdvisory` warning is issued if the
    multiplicity found is not 1.
    """
    from pytools import product

    logger.debug("%s: adjust temp var storage" % kernel.name)

    adjusted_temps = {}
    lmem_size = cl_char.usable_local_mem_size(device)

    for tv in six.itervalues(kernel.temporary_variables):
        if not tv.is_local:
            adjusted_temps[tv.name] = tv.copy(storage_shape=tv.shape)
            continue

        # Local memory taken up by all *other* local temporaries.
        other_loctemp_nbytes = [
                other.nbytes
                for other in six.itervalues(kernel.temporary_variables)
                if other.is_local and other.name != tv.name]

        base_shape = tv.storage_shape
        if base_shape is None:
            base_shape = tv.shape
        base_shape = list(base_shape)

        if device.local_mem_type == cl.device_local_mem_type.GLOBAL:
            # FIXME: could try to avoid cache associativity disasters
            chosen_shape = base_shape

        elif device.local_mem_type == cl.device_local_mem_type.LOCAL:
            best_mult = cl_char.local_memory_bank_count(device)
            best_incr = None
            best_why_not = None
            chosen_shape = base_shape

            # Only the last axis is padded to avoid bank conflicts.
            for increment in range(base_shape[-1]//2):
                candidate = base_shape[:-1] + [base_shape[-1] + increment]

                mult, why_not = cl_char.why_not_local_access_conflict_free(
                        device, tv.dtype.itemsize,
                        tv.shape, candidate)

                # will choose smallest increment 'automatically'
                if mult >= best_mult:
                    continue

                lmem_use = (sum(other_loctemp_nbytes)
                        + tv.dtype.itemsize*product(candidate))
                if lmem_use < lmem_size:
                    chosen_shape = candidate
                    best_mult = mult
                    best_why_not = why_not
                    best_incr = increment

            if best_mult != 1:
                from warnings import warn
                from loopy.diagnostic import LoopyAdvisory
                warn("could not find a conflict-free mem layout "
                        "for local variable '%s' "
                        "(currently: %dx conflict, increment: %s, reason: %s)"
                        % (tv.name, best_mult, best_incr, best_why_not),
                        LoopyAdvisory)

        else:
            from warnings import warn
            warn("unknown type of local memory")

            chosen_shape = base_shape

        adjusted_temps[tv.name] = tv.copy(storage_shape=chosen_shape)

    return kernel.copy(temporary_variables=adjusted_temps)
def local_expansions_level_starts(self):
    """Return ``self._expansions_level_starts`` evaluated with a callable
    that maps a term count to the product of
    ``self.expansion_shape(nterms)``.
    """
    from pytools import product

    def entries_per_expansion(nterms):
        return product(self.expansion_shape(nterms))

    return self._expansions_level_starts(entries_per_expansion)
def generate_box_mesh(axis_coords, order=1, coord_dtype=np.float64,
        group_factory=None, boundary_tag_to_face=None):
    """Create a semi-structured mesh.

    :param axis_coords: a tuple with a number of entries corresponding
        to the number of dimensions, with each entry a numpy array
        specifying the coordinates to be used along that axis.
    :param group_factory: One of :class:`meshmode.mesh.SimplexElementGroup`
        or :class:`meshmode.mesh.TensorProductElementGroup`.
    :param boundary_tag_to_face: an optional dictionary for tagging
        boundaries. The keys correspond to custom boundary tags, with the
        values giving a list of the faces on which they should be applied
        in terms of coordinate directions (``+x``, ``-x``, ``+y``, ``-y``,
        ``+z``, ``-z``, ``+w``, ``-w``).

        For example::

            boundary_tag_to_face={"bdry_1": ["+x", "+y"], "bdry_2": ["-x"]}

    :raises ValueError: if an axis has fewer than two points, if
        *group_factory* is unsupported, or if a face identifier in
        *boundary_tag_to_face* is malformed.
    :raises NotImplementedError: for more than three dimensions.

    .. versionchanged:: 2017.1

        *group_factory* parameter added.

    .. versionchanged:: 2020.1

        *boundary_tag_to_face* parameter added.
    """

    if boundary_tag_to_face is None:
        boundary_tag_to_face = {}

    for iaxis, axc in enumerate(axis_coords):
        if len(axc) < 2:
            raise ValueError("need at least two points along axis %d"
                    % (iaxis + 1))

    dim = len(axis_coords)

    shape = tuple(len(axc) for axc in axis_coords)

    from pytools import product
    nvertices = product(shape)

    # Vertices are numbered in Fortran order (first axis varies fastest).
    vertex_indices = np.arange(nvertices).reshape(*shape, order="F")

    # Store the coordinates with reversed axes so that the C-order
    # flattening below agrees with the Fortran-order numbering above, also
    # when the axes have different numbers of points. (With equal point
    # counts this is identical to the un-reversed layout.)
    vertices = np.empty((dim,) + shape[::-1], dtype=coord_dtype)
    for idim in range(dim):
        vshape = (shape[idim],) + (1,) * idim
        vertices[idim] = axis_coords[idim].reshape(*vshape)

    vertices = vertices.reshape(dim, -1)

    from meshmode.mesh import SimplexElementGroup, TensorProductElementGroup
    if group_factory is None:
        group_factory = SimplexElementGroup

    if issubclass(group_factory, SimplexElementGroup):
        is_tp = False
    elif issubclass(group_factory, TensorProductElementGroup):
        is_tp = True
    else:
        raise ValueError("unsupported value for 'group_factory': %s"
                % group_factory)

    el_vertices = []

    if dim == 1:
        for i in range(shape[0] - 1):
            # a--b

            a = vertex_indices[i]
            b = vertex_indices[i + 1]

            el_vertices.append((a, b,))

    elif dim == 2:
        for i in range(shape[0] - 1):
            for j in range(shape[1] - 1):

                # c--d
                # |  |
                # a--b

                a = vertex_indices[i, j]
                b = vertex_indices[i + 1, j]
                c = vertex_indices[i, j + 1]
                d = vertex_indices[i + 1, j + 1]

                if is_tp:
                    el_vertices.append((a, b, c, d))
                else:
                    el_vertices.append((a, b, c))
                    el_vertices.append((d, c, b))

    elif dim == 3:
        for i in range(shape[0] - 1):
            for j in range(shape[1] - 1):
                for k in range(shape[2] - 1):

                    a000 = vertex_indices[i, j, k]
                    a001 = vertex_indices[i, j, k + 1]
                    a010 = vertex_indices[i, j + 1, k]
                    a011 = vertex_indices[i, j + 1, k + 1]

                    a100 = vertex_indices[i + 1, j, k]
                    a101 = vertex_indices[i + 1, j, k + 1]
                    a110 = vertex_indices[i + 1, j + 1, k]
                    a111 = vertex_indices[i + 1, j + 1, k + 1]

                    if is_tp:
                        # NOTE(review): here the last axis varies fastest,
                        # whereas the 2D tensor-product case above lists
                        # the first axis fastest -- confirm this matches
                        # what make_group_from_vertices expects.
                        el_vertices.append(
                                (a000, a001, a010, a011,
                                    a100, a101, a110, a111))

                    else:
                        # Six tetrahedra per box cell.
                        el_vertices.append((a000, a100, a010, a001))
                        el_vertices.append((a101, a100, a001, a010))
                        el_vertices.append((a101, a011, a010, a001))

                        el_vertices.append((a100, a010, a101, a110))
                        el_vertices.append((a011, a010, a110, a101))
                        el_vertices.append((a011, a111, a101, a110))

    else:
        raise NotImplementedError("box meshes of dimension %d" % dim)

    el_vertices = np.array(el_vertices, dtype=np.int32)

    grp = make_group_from_vertices(
            vertices.reshape(dim, -1), el_vertices, order,
            group_factory=group_factory)

    # {{{ compute facial adjacency for mesh if there is tag information

    facial_adjacency_groups = None
    face_vertex_indices_to_tags = {}
    boundary_tags = list(boundary_tag_to_face.keys())
    axes = ["x", "y", "z", "w"]

    if boundary_tags:
        # Maps a vertex number back to its grid position.
        vert_index_to_tuple = {
                vertex_indices[itup]: itup
                for itup in np.ndindex(shape)}

    for tag in boundary_tags:
        # Need to map the correct face vertices to the boundary tags
        for face in boundary_tag_to_face[tag]:
            if len(face) != 2:
                raise ValueError("face identifier '%s' does not "
                        "consist of exactly two characters" % face)

            side, axis = face
            try:
                axis = axes.index(axis)
            except ValueError:
                raise ValueError("unrecognized axis in face identifier '%s'"
                        % face)
            if axis >= dim:
                raise ValueError(
                        "axis in face identifier '%s' does not exist in %dD"
                        % (face, dim))

            # Grid index along *axis* that all vertices of a face on the
            # requested side must share.
            if side == "-":
                vert_crit = 0
            elif side == "+":
                vert_crit = shape[axis] - 1
            else:
                raise ValueError(
                        "first character of face identifier '%s' is not "
                        "'+' or '-'" % face)

            for ielem in range(0, grp.nelements):
                for ref_fvi in grp.face_vertex_indices():
                    fvi = grp.vertex_indices[ielem, ref_fvi]
                    fvi_tuples = [vert_index_to_tuple[i] for i in fvi]

                    if all(fvi_tuple[axis] == vert_crit
                            for fvi_tuple in fvi_tuples):
                        key = frozenset(fvi)
                        face_vertex_indices_to_tags.setdefault(
                                key, []).append(tag)

    if boundary_tags:
        from meshmode.mesh import (
                _compute_facial_adjacency_from_vertices,
                BTAG_ALL, BTAG_REALLY_ALL)
        boundary_tags.extend([BTAG_ALL, BTAG_REALLY_ALL])
        facial_adjacency_groups = _compute_facial_adjacency_from_vertices(
                [grp], boundary_tags, np.int32, np.int8,
                face_vertex_indices_to_tags)
    else:
        facial_adjacency_groups = None

    # }}}

    from meshmode.mesh import Mesh
    return Mesh(vertices, [grp],
            facial_adjacency_groups=facial_adjacency_groups,
            is_conforming=True, boundary_tags=boundary_tags)
def get_temporary_decls(self, codegen_state, schedule_index):
    """Return the declarations for the kernel's temporaries.

    The result lists the base-storage declarations first, followed by the
    declarations of the individual temporaries, and is terminated by an
    empty line if it is non-empty.
    """
    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
    from loopy.kernel.array import VectorArrayDimTag
    from loopy.kernel.data import temp_var_scope
    from pytools import product, single_valued

    knl = codegen_state.kernel

    storage_decls = []
    tv_decls = []

    # {{{ declare temporaries

    # Per base storage name: sizes, scopes and alignments of all
    # temporaries living in it.
    sizes_by_storage = {}
    scopes_by_storage = {}
    aligns_by_storage = {}

    tvs_by_name = sorted(
            six.itervalues(knl.temporary_variables),
            key=lambda var: var.name)

    for tv in tvs_by_name:
        decl_info = tv.decl_info(self.target, index_dtype=knl.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global
                # declarations
                if tv.scope != temp_var_scope.GLOBAL:
                    plain_decl = self.get_temporary_decl(
                            knl, schedule_index, tv, idi)
                    decl = self.wrap_temporary_decl(plain_decl, tv.scope)

                    if tv.initializer is not None:
                        literal = generate_array_literal(
                                codegen_state, tv, tv.initializer)
                        decl = Initializer(decl, literal)

                    tv_decls.append(decl)

            continue

        # From here on: the temporary lives inside a shared base storage.
        assert tv.initializer is None

        offset = 0
        sizes_by_storage.setdefault(tv.base_storage, []).append(tv.nbytes)
        scopes_by_storage.setdefault(tv.base_storage, []).append(tv.scope)

        # Vector axes multiply the required alignment.
        align_size = tv.dtype.itemsize
        for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
            if isinstance(dim_tag, VectorArrayDimTag):
                align_size *= axis_len

        aligns_by_storage.setdefault(tv.base_storage, []).append(align_size)

        for idi in decl_info:
            cast_decl = self.wrap_temporary_decl(
                    POD(self, idi.dtype, ""), tv.scope)
            ptr_decl = self.wrap_temporary_decl(
                    POD(self, idi.dtype, idi.name), tv.scope)

            # The 'restrict' part of this is a complete lie--of course
            # all these temporaries are aliased. But we're promising to
            # not use them to shovel data from one representation to the
            # other. That counts, right?
            cast_decl = _ConstRestrictPointer(cast_decl)
            ptr_decl = _ConstRestrictPointer(ptr_decl)

            cast_tp, cast_d = cast_decl.get_decl_pair()
            init_expr = "(%s %s) (%s + %s)" % (
                    " ".join(cast_tp), cast_d,
                    tv.base_storage,
                    offset)

            tv_decls.append(Initializer(ptr_decl, init_expr))

            offset += idi.dtype.itemsize * product(si for si in idi.shape)

    for bs_name, bs_sizes in sorted(six.iteritems(sizes_by_storage)):
        bs_decl = self.wrap_temporary_decl(
                Value("char", bs_name),
                single_valued(scopes_by_storage[bs_name]))

        # The storage must be able to hold the largest temporary in it.
        bs_decl = ArrayOf(bs_decl, max(bs_sizes))
        bs_decl = AlignedAttribute(max(aligns_by_storage[bs_name]), bs_decl)

        storage_decls.append(bs_decl)

    # }}}

    all_decls = storage_decls + tv_decls

    if all_decls:
        all_decls.append(Line())

    return all_decls
def _apply_single_qubit_ptm(self, qubit, ptm):
    # noinspection PyUnresolvedReferences
    """Apply a one-qubit Pauli transfer matrix to qubit `qubit`.

    The result is written into the work buffer, which is then swapped with
    the state buffer, so ``self._data`` refers to the transformed state on
    return.

    Parameters
    ----------
    qubit : int
        Qubit index.
    ptm : array-like
        A PTM in the basis of the qubit. Must be a 2D array with axes
        ``(out, in)``; the input dimension must match the current dimension
        of `qubit` in the state.

    Raises
    ------
    ValueError
        If `ptm` is not two-dimensional.
    """
    new_shape = list(self._data.shape)
    self._validate_qubit(qubit, 'qubit')

    # TODO Refactor to use self._validate_ptm
    if len(ptm.shape) != 2:
        raise ValueError(
            "`ptm` must be a 2D array, got {}D".format(len(ptm.shape)))

    dim_bit_out, dim_bit_in = ptm.shape
    # Check the PTM's input dimension against the state *before* the entry
    # is overwritten with the output dimension.
    assert new_shape[qubit] == dim_bit_in
    new_shape[qubit] = dim_bit_out
    new_size = pytools.product(new_shape)
    # 8 bytes per float64 entry
    new_size_bytes = new_size * 8

    if self._work_data.gpudata.size < new_size_bytes:
        # reallocate
        self._work_data.gpudata.free()
        self._work_data = ga.empty(new_shape, np.float64)
        # Record the allocation size so later calls can compare against it.
        self._work_data.gpudata.size = self._work_data.nbytes
    else:
        # reallocation not required,
        # reshape but reuse allocation
        self._work_data = ga.GPUArray(
            shape=new_shape,
            dtype=np.float64,
            gpudata=self._work_data.gpudata,
        )

    ptm_gpu = self._cached_gpuarray(ptm)

    dint = min(64, self._data.size // dim_bit_in)
    block = (1, dim_bit_out, dint)
    blocksize = dim_bit_out * dint
    grid_size = max(1, (new_size - 1) // blocksize + 1)
    grid = (grid_size, 1, 1)

    dim_z = pytools.product(self._data.shape[qubit + 1:])
    dim_y = pytools.product(self._data.shape[:qubit])
    dim_rho = new_size  # self.data.size

    # The two-qubit kernel is reused, with a first-qubit input dimension
    # of 1.
    _two_qubit_general_ptm.prepared_call(
        grid,
        block,
        self._data.gpudata,
        self._work_data.gpudata,
        ptm_gpu.gpudata,
        1, dim_bit_in,
        dim_z,
        dim_y,
        dim_rho,
        shared_size=8 * (ptm.size + blocksize))

    # The work buffer now holds the result; make it the current state.
    self._data, self._work_data = self._work_data, self._data
def grid_point_count(self):
    """Return the product of the entries of ``self.grid_point_counts()``."""
    per_direction = self.grid_point_counts()
    return pytools.product(per_direction)
def map_product(self, expr):
    """Map every child of *expr* through ``self.rec`` and return the
    product of the results.
    """
    from pytools import product

    mapped_factors = (self.rec(child) for child in expr.children)
    return product(mapped_factors)
def diagonal(self, *, get_data=True, target_array=None, flatten=True):
    """Obtain the diagonal of the density matrix.

    Parameters
    ----------
    target_array : None or pycuda.gpuarray.array
        An already-allocated GPU array to which the data will be copied.
        If `None`, the internal work buffer is used; it is reallocated
        first if it is too small to hold the diagonal.
    get_data : boolean
        Whether the data should be copied from the GPU. If false, a GPU
        array that shares memory with the target array is returned instead.
    flatten : boolean
        Only used when `get_data` is true. If true, the diagonal is
        returned as a one-dimensional array; otherwise it is reshaped to
        have one axis per basis in ``self.bases``.

    Returns
    -------
    numpy.ndarray or pycuda.gpuarray.GPUArray
        The diagonal entries, on the host if `get_data` is true and on the
        GPU otherwise.

    Raises
    ------
    ValueError
        If `target_array` is given but has fewer elements than the
        diagonal requires.
    """
    diag_bases = [pb.computational_subbasis() for pb in self.bases]
    diag_shape = [db.dim_pauli for db in diag_bases]
    diag_size = pytools.product(diag_shape)

    if target_array is None:
        # 8 bytes per float64 element
        if self._work_data.gpudata.size < diag_size * 8:
            self._work_data.gpudata.free()
            self._work_data = ga.empty(diag_shape, np.float64)
            # Record the allocation size so later size checks can use it.
            self._work_data.gpudata.size = self._work_data.nbytes
        target_array = self._work_data
    elif target_array.size < diag_size:
        raise ValueError(
            "Size of `target_array` is too small ({}).\n"
            "Should be at least {}.".format(target_array.size, diag_size))

    # For every basis, the indices of the elements that belong to the
    # computational basis (entries mapped to None are skipped).
    idx = [[
        pb.computational_basis_indices[i]
        for i in range(pb.dim_hilbert)
        if pb.computational_basis_indices[i] is not None
    ] for pb in self.bases]

    # idx_j is the concatenation of all per-basis index lists; idx_i holds
    # the offset at which each basis' list starts inside idx_j.
    idx_j = np.array(list(pytools.flatten(idx))).astype(np.uint32)
    idx_i = np.cumsum([0] + [len(i) for i in idx][:-1]).astype(np.uint32)

    xshape = np.array(self._data.shape, np.uint32)
    yshape = np.array(diag_shape, np.uint32)

    xshape_gpu = self._cached_gpuarray(xshape)
    yshape_gpu = self._cached_gpuarray(yshape)

    idx_i_gpu = self._cached_gpuarray(idx_i)
    idx_j_gpu = self._cached_gpuarray(idx_j)

    # 256 threads per block, and enough blocks to cover the whole diagonal.
    block = (2**8, 1, 1)
    grid = (max(1, (diag_size - 1) // 2**8 + 1), 1, 1)

    if len(yshape) == 0:
        # Degenerate case with no bases at all: there is nothing to
        # gather, so the data is copied over unchanged.
        target_array.set(self._data.get())
    else:
        _multitake.prepared_call(
            grid, block,
            self._data.gpudata,
            target_array.gpudata,
            idx_i_gpu.gpudata, idx_j_gpu.gpudata,
            xshape_gpu.gpudata, yshape_gpu.gpudata,
            np.uint32(len(yshape)))

    if get_data:
        # The target may be larger than needed; keep only the leading
        # diag_size entries.
        if flatten:
            return target_array.get().ravel()[:diag_size]
        else:
            return (
                target_array.get().ravel()[:diag_size].reshape(diag_shape))
    else:
        # Shares memory with target_array; no copy is made.
        return ga.GPUArray(shape=diag_shape,
                           gpudata=target_array.gpudata,
                           dtype=np.float64)