class Measurement:
    _kernel = cupy.ReductionKernel(
        "uint64 target_mask, float64 p, T q_inout",
        "float64 b",
        "(!(_j & target_mask)) * (q_inout * q_inout.conj()).real()",
        "a + b",
        """
        b = a;  // store the reduced probability for the caller (apply() reads it)
        q_inout = (!(_i & target_mask) == (a <= p)) * q_inout /
            sqrt((a <= p) * a + (a > p) * (1.0 - a))
        """,
        "0.0")
    no_cache = True

    def __init__(self):
        pass

    def apply(self, helper, qubits, targets):
        n_qubits = helper["n_qubits"]
        i = helper["indices"]
        p = random.random()
        for target in slicing(targets, n_qubits):
            target_mask = 1 << target
            a = self._kernel(target_mask, p, qubits)
            helper["cregs"][target] = int(a > p)
        return qubits

    def to_qasm(self, helper, targets):
        n_qubits = helper["n_qubits"]
        qasm = []
        for target in slicing(targets, n_qubits):
            qasm.append("measure q[{}] -> c[{}];".format(target, target))
        return qasm
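# Hedged reference (not part of the original class): the probability the kernel
# above accumulates, P(qubit k reads 0), can also be computed with plain CuPy
# ops. `q` (a 2**n-amplitude state vector) and `k` are illustrative names.
import cupy

def prob_zero(q, k):
    idx = cupy.arange(q.size)
    amps = q[(idx & (1 << k)) == 0]  # amplitudes whose index has bit k clear
    return float(cupy.sum((amps * amps.conj()).real))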
def forward_gpu(self, inputs):
    if not self.gpu_optim:
        return self.forward_cpu(inputs)
    xp = cuda.get_array_module(*inputs)
    x, gamma, beta = inputs
    a = x - xp.mean(x, axis=1, keepdims=True)
    assert len(a.shape) == 2
    H = a.shape[1]
    # inv_norm = inv_norm_comp(a/math.sqrt(H), axis=1, keepdims=True)
    # 1.0/xp.sqrt(xp.sum(a*a, axis=1, keepdims=True) + self.eps)
    inv_norm = cp.ReductionKernel(
        'T x',  # input params
        'T y',  # output params
        'x * x',  # map
        'a + b',  # reduce
        'y = 1.0/sqrt(a/%f + %f)' % (H, self.eps),  # post-reduction map
        '0',  # identity value
        'inv_norm_comp'  # kernel name
    )(a, axis=1, keepdims=True)
    self.inv_norm = inv_norm
    normalized, scaled = scale_output(a, inv_norm, gamma, beta)
    self.normalized = normalized
    return scaled,
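# Hedged note (not in the original): with `a` being the row-wise sum of x*x,
# the post-map 'y = 1.0/sqrt(a/H + eps)' equals 1/sqrt(mean(x*x) + eps), so it
# should agree with this pure-CuPy reference; `a2` and `eps` are illustrative.
import cupy as cp

a2 = cp.random.randn(4, 8)
eps = 1e-6
inv_norm_ref = 1.0 / cp.sqrt(cp.mean(a2 * a2, axis=1, keepdims=True) + eps)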
def test_optimize_reduction_kernel(self):
    my_sum = cupy.ReductionKernel(
        'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
    x = testing.shaped_arange((3, 4), cupy)
    y1 = my_sum(x, axis=1)
    with cupyx.optimizing.optimize():
        y2 = my_sum(x, axis=1)
    testing.assert_array_equal(y1, y2)
def test_optimize_cache(self):
    if (_accelerator.ACCELERATOR_CUB in
            _accelerator.get_reduction_accelerators()):
        pytest.skip('optimize cannot be mocked for CUB reduction')
    target = cupyx.optimizing._optimize._optimize
    target_full_name = '{}.{}'.format(target.__module__, target.__name__)
    with mock.patch(target_full_name) as optimize_impl:
        my_sum = cupy.ReductionKernel(
            'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
        my_sum_ = cupy.ReductionKernel(
            'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum_')
        x = testing.shaped_arange((3, 4), cupy)
        x_ = testing.shaped_arange((3, 4), cupy)
        y = testing.shaped_arange((4, 4), cupy)
        z = testing.shaped_arange((3, 4), cupy)[::-1]
        assert x.strides == y.strides
        assert x.shape == z.shape

        with cupyx.optimizing.optimize():
            my_sum(x, axis=1)
            assert optimize_impl.call_count == 1
            my_sum(x, axis=1)
            assert optimize_impl.call_count == 1
            my_sum(x, axis=0)
            assert optimize_impl.call_count == 2
            my_sum(x_, axis=1)
            assert optimize_impl.call_count == 2
            my_sum(y, axis=1)
            assert optimize_impl.call_count == 3
            my_sum(z, axis=1)
            assert optimize_impl.call_count == 4
            my_sum_(x, axis=1)
            assert optimize_impl.call_count == 5

        with cupyx.optimizing.optimize(key='new_key'):
            my_sum(x, axis=1)
            assert optimize_impl.call_count == 6

        with cupyx.optimizing.optimize(key=None):
            my_sum(x, axis=1)
            assert optimize_impl.call_count == 6
            my_sum(x)
            assert optimize_impl.call_count == 7
def test_invalid_kernel_name(self):
    with self.assertRaisesRegex(ValueError, 'Invalid kernel name'):
        cupy.ReductionKernel(
            'T x', 'T y', 'x', 'a + b', 'y = a', '0', name='1')
def __get_count_votes_cupy_kernel(invert):
    sign = '>' if invert else '<'
    return cupy.ReductionKernel(
        'X x', 'Y y',
        '_raw_x[_in_ind.size()/2]{}x'.format(sign),
        'a + b', 'y = a', '0', 'lt',
        reduce_type='int')
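# Hedged usage sketch (not from the original file): kernels that index `_raw_x`
# around `_in_ind` like the one above are intended to be passed to
# cupyx.scipy.ndimage.generic_filter, which evaluates the reduction per window.
import cupy
import cupyx.scipy.ndimage

vote_kernel = __get_count_votes_cupy_kernel(invert=False)
img = cupy.random.rand(32, 32)
# per pixel: how many elements of the 3x3 window exceed the center value
votes = cupyx.scipy.ndimage.generic_filter(img, vote_kernel, size=3)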
def cupy_threshold_local_mean(*args, **kwargs):
    # Code snippet taken from https://github.com/cupy/cupy/issues/3909
    my_mean = cupy.ReductionKernel(
        'T x',  # input params
        'T y',  # output params
        'x',  # map
        'a + b',  # reduce
        'y = a / _in_ind.size()',  # an undocumented variable and a hack
        '0',  # identity value
        'mean'  # kernel name
    )
    return my_mean
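# Hedged sanity check (not in the original): for a full reduction,
# `_in_ind.size()` is the total number of reduced elements, so the kernel
# should agree with cupy.mean over the whole array.
import cupy

my_mean = cupy_threshold_local_mean()
x = cupy.arange(12, dtype=cupy.float64).reshape(3, 4)
assert cupy.allclose(my_mean(x), x.mean())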
@memoize()
def reduce(in_params, out_params, map_expr, reduce_expr, post_map_expr,
           identity, name, **kwargs):
    """Creates a global reduction kernel function.

    This function uses :func:`~chainer.cuda.memoize` to cache the resulting
    kernel object, i.e. the resulting kernel object is cached for each
    argument combination and CUDA device.

    The arguments are the same as those for :class:`cupy.ReductionKernel`,
    except that the ``name`` argument is mandatory.

    """
    check_cuda_available()
    return cupy.ReductionKernel(
        in_params, out_params, map_expr, reduce_expr, post_map_expr,
        identity, name, **kwargs)
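# Hedged usage sketch (not in the original): building and calling a cached sum
# kernel through the wrapper above.
import cupy

my_sum = reduce('T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
out = my_sum(cupy.arange(6).reshape(2, 3), axis=1)  # -> array([ 3, 12])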
def test_optimize_cache_multi_gpus(self):
    target = cupyx.optimizing._optimize._optimize
    target_full_name = '{}.{}'.format(target.__module__, target.__name__)
    with mock.patch(target_full_name) as optimize_impl:
        my_sum = cupy.ReductionKernel(
            'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')

        with cupyx.optimizing.optimize():
            with cupy.cuda.Device(0):
                x = testing.shaped_arange((3, 4), cupy)
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 1

            with cupy.cuda.Device(1):
                x = testing.shaped_arange((3, 4), cupy)
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 2
def update_core_gpu(self, param):
    grad = param.grad
    if grad is None:
        return
    hp = self.hyperparam
    p = self.state['p']
    if HamiltonianExplicitRule._kernel_x is None:
        HamiltonianExplicitRule._kernel_x = cp.ElementwiseKernel(
            'T epsilon, T p, T denomp, T param',
            'T x',
            'x = param + epsilon * p / denomp',
            'Hamiltonian_x')
    if HamiltonianExplicitRule._kernel_r is None:
        HamiltonianExplicitRule._kernel_r = cp.ReductionKernel(
            'T p',
            'T denomp',
            'p * p',
            'a + b',
            'denomp = sqrt(a)',
            '1',
            'relativistic')
    if hp.approx == 'first':
        if HamiltonianExplicitRule._kernel_p is None:
            HamiltonianExplicitRule._kernel_p = cp.ElementwiseKernel(
                'T delta, T epsilon, T grad, T p0',
                'T p1',
                'p1 = p0 * delta - epsilon * delta * grad',
                'Hamiltonian_p')
        # p
        p = HamiltonianExplicitRule._kernel_p(hp.delta, hp.epsilon, grad, p)
        # x
        denomp = HamiltonianExplicitRule._kernel_r(p)
        param.data = HamiltonianExplicitRule._kernel_x(
            hp.epsilon, p, denomp, param.data)
    else:
        # Create the kernel once, then always apply it (the original gated the
        # update behind an `else`, which skipped it on the first call).
        if HamiltonianExplicitRule._kernel_p is None:
            HamiltonianExplicitRule._kernel_p = cp.ElementwiseKernel(
                'T delta, T epsilon, T grad, T p0',
                'T p1',
                'p1 = p0 * (2.0 - (1.0 / delta)) - epsilon * grad',
                'Hamiltonian_p')
        # p
        p = HamiltonianExplicitRule._kernel_p(hp.delta, hp.epsilon, grad, p)
        # x
        denomp = HamiltonianExplicitRule._kernel_r(p)
        param.data = HamiltonianExplicitRule._kernel_x(
            hp.epsilon, p, denomp, param.data)
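# Hedged reference (not in the original): the 'relativistic' reduction above is
# an L2 norm of the momentum, so it should match the plain-CuPy expression
# below; `p` here is an illustrative array.
import cupy as cp

p = cp.random.randn(1000)
denomp_ref = cp.sqrt((p * p).sum())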
def test_optimize_cache_multi_gpus(self):
    if (_accelerator.ACCELERATOR_CUB in
            _accelerator.get_reduction_accelerators()):
        pytest.skip('optimize cannot be mocked for CUB reduction')
    target = cupyx.optimizing._optimize._optimize
    target_full_name = '{}.{}'.format(target.__module__, target.__name__)
    with mock.patch(target_full_name) as optimize_impl:
        my_sum = cupy.ReductionKernel(
            'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')

        with cupyx.optimizing.optimize():
            with cupy.cuda.Device(0):
                x = testing.shaped_arange((3, 4), cupy)
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 1

            with cupy.cuda.Device(1):
                x = testing.shaped_arange((3, 4), cupy)
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 2
def test_optimize_pickle(self):
    my_sum = cupy.ReductionKernel(
        'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
    x = testing.shaped_arange((3, 4), cupy)

    with tempfile.TemporaryDirectory() as directory:
        filepath = directory + '/optimize_params'

        with cupyx.optimizing.optimize() as context:
            my_sum(x, axis=1)
            params_map = context._params_map
            context.save(filepath)

        cupy.core._optimize_config._clear_all_contexts_cache()

        with cupyx.optimizing.optimize() as context:
            assert params_map.keys() != context._params_map.keys()
            context.load(filepath)
            assert params_map.keys() == context._params_map.keys()

        with cupyx.optimizing.optimize(key='other_key') as context:
            with pytest.raises(ValueError):
                context.load(filepath)
    vert_norm[a*3+1] += yy;
    vert_norm[a*3+2] += zz;
    vert_norm[b*3] += xx;
    vert_norm[b*3+1] += yy;
    vert_norm[b*3+2] += zz;
    vert_norm[c*3] += xx;
    vert_norm[c*3+1] += yy;
    vert_norm[c*3+2] += zz;
    float area = sqrt(xx*xx + yy*yy + zz*zz) / 3;
    vertex_weight[a] += area;
    vertex_weight[b] += area;
    vertex_weight[c] += area;
}
''', 'calc_vertex_norm')

calc_avg_vertex = cp.ReductionKernel(
    'T x, T w', 'T y', 'x * w', 'a + b', 'y = a', '0', 'calc_avg_vertex')


class Obj3D(object):
    def __init__(self, filename, lines=None):
        if lines is None:
            file = open(filename, 'r')
            lines = file.readlines()
        iter = 0
        if lines[0].rstrip() == 'OFF':
            self.numVertices, self.numFaces, self.numEdges = (
                int(x) for x in str.split(lines[1]))
            iter = 2
        elif lines[0][0:3] == 'OFF':
            self.numVertices, self.numFaces, self.numEdges = (
def get_sum_func(self):
    return cupy.ReductionKernel(
        'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
import cupy
import numpy

add_kernel = cupy.ReductionKernel(
    "T x",
    "T m",
    "x",
    "a + b",
    "m = a",
    "0",
    "avg",
)


def avg(x):
    return add_kernel(x) / x.size


x = cupy.arange(10, dtype=numpy.float64)
print(avg(x), numpy.mean(x))
import cupy as cp

# Double precision
curesiduals = cp.ElementwiseKernel(
    'float64 holo, float64 data, float64 noise',
    'float64 residuals',
    'residuals = (holo - data) / noise',
    'curesiduals')

cuchisqr = cp.ReductionKernel(
    'float64 holo, float64 data, float64 noise',
    'float64 chisqr',
    '((holo - data) / noise) * ((holo - data) / noise)',
    'a + b',
    'chisqr = a',
    '0',
    'cuchisqr')

cuabsolute = cp.ReductionKernel(
    'float64 holo, float64 data, float64 noise',
    'float64 s',
    'abs((holo - data) / noise)',
    'a + b',
    's = a',
    '0',
    'cuabsolute')

# Single precision
curesidualsf = cp.ElementwiseKernel(
    'float32 holo, float32 data, float32 noise',
    'float32 residuals',
    'residuals = (holo - data) / noise',
    'curesiduals')

cuchisqrf = cp.ReductionKernel(
    'float32 holo, float32 data, float32 noise',
    'float32 chisqr',
    '((holo - data) / noise) * ((holo - data) / noise)',
    'a + b',
    'chisqr = a',
    '0',
    'cuchisqr')

cuabsolutef = cp.ReductionKernel(
    'float32 holo, float32 data, float32 noise',
    'float32 s',
    'abs((holo - data) / noise)',
class TestFilterComplexFast(FilterTestCaseBase):
    @testing.numpy_cupy_allclose(atol=1e-5, rtol=1e-5, scipy_name='scp')
    @testing.with_requires('scipy>=1.6.0')
    def test_filter(self, xp, scp):
        return self._filter(xp, scp)


# Kernels and Functions for testing generic_filter
rms_raw = cupy.RawKernel('''extern "C" __global__
void rms(const double* x, int filter_size, double* y) {
    double ss = 0;
    for (int i = 0; i < filter_size; ++i) {
        ss += x[i]*x[i];
    }
    y[0] = ss/filter_size;
}''', 'rms')

rms_red = cupy.ReductionKernel('X x', 'Y y', 'x*x', 'a + b',
                               'y = a/_in_ind.size()', '0', 'rms')


def rms_pyfunc(x):
    return (x * x).sum() / len(x)


lt_raw = cupy.RawKernel('''extern "C" __global__
void lt(const double* x, int filter_size, double* y) {
    int n = 0;
    double c = x[filter_size/2];
    for (int i = 0; i < filter_size; ++i) {
        n += c>x[i];
    }
    y[0] = n;
}''', 'lt')

lt_red = cupy.ReductionKernel('X x',
def setUp(self):
    self.my_sum = cupy.ReductionKernel(
        'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import cupy as cp
import numpy as np

if __name__ == '__main__':
    x = cp.arange(10, dtype=np.float32).reshape(2, 5)

    L2norm_kernel = cp.ReductionKernel(
        'T x',          # in_params: input
        'T y',          # out_params: output
        'x * x',        # map_expr: pre-processing
        'a + b',        # reduce_expr: reduction
        'y = sqrt(a)',  # post_map_expr: post-processing
        '0',            # identity: initial value
        'l2norm')       # name: kernel name
    y = L2norm_kernel(x)
    print(x)
    print(y)

# from p.13 of https://www.slideshare.net/ryokuta/cupy
# import chainer
# import cupy as cp
# #import numpy as np
#
# l2norm_kernel = cp.ReductionKernel(
#     'T x',        # input
#     'T y',        # output
#     'x * x',      # preprocess
#     'a + b',      # reduce
#     'y=sqrt(a)',  # post-process
    w_flat = w.reshape(-1, w.shape[2])
    if w_flat_sq is None:
        w_flat_sq = xp.power(w_flat, 2).sum(axis=1, keepdims=True)
    x_sq = xp.power(x, 2).sum(axis=1, keepdims=True)
    num = xp.dot(x, w_flat.T)
    denum = xp.sqrt(x_sq * w_flat_sq.T)
    similarity = xp.nan_to_num(num / denum)
    return 1 - similarity


if _cupy_available:
    _manhattan_distance_kernel = cp.ReductionKernel(
        'T x, T w',
        'T y',
        'abs(x-w)',
        'a+b',
        'y = a',
        '0',
        'l1norm')


def manhattan_distance(x, w, xp=default_xp):
    """Calculate Manhattan distance.

    It is very slow (~10x) compared to euclidean distance.
    TODO: improve performance. Maybe a custom kernel is necessary.

    NB: result shape is (N,X*Y)
    """
    if xp.__name__ == 'cupy':
        d = _manhattan_distance_kernel(x[:, xp.newaxis, xp.newaxis, :],
                                       w[xp.newaxis, :, :, :],
import math

import cupy as cp
import numpy as np

l2norm_kernel = cp.ReductionKernel(
    'T x',          # input params
    'T y',          # output params
    'x * x',        # map
    'a + b',        # reduce
    'y = sqrt(a)',  # post-reduction map
    '0',            # identity value
    'l2norm'        # kernel name
)

x = cp.arange(5, dtype=np.float32).reshape(1, 5)
print(x)
print(l2norm_kernel(x, axis=1))
print(math.sqrt(0 * 0 + 1 * 1 + 2 * 2 + 3 * 3 + 4 * 4))
import tabulate
import numpy as np
import cupy as cp
from timer import stopwatch

# define ReductionKernel
rd_kinetic_kernel = cp.ReductionKernel(
    'T m, T v',         # input
    'T y',              # output
    'v * v',            # pre-process
    'a + b',            # reduction
    'y = 0.5 * m * a',  # post-process
    '0',                # initial
    'rd_kinetic')       # name


@stopwatch
def kinetic_kernel(m, v):
    return rd_kinetic_kernel(m, v)


@stopwatch
def kinetic(m, v):
    return 0.5 * m * np.sum(v*v)


# test ReductionKernel
N = [1, 10, 1000000, 10000000, 100000000]
times_cpu = []
times_gpu = []
times_kernel = []
for n in N:
    v = np.sin(np.linspace(-np.pi, np.pi, n)).astype(np.float32) / n
def setup(self, raster_dim, zone_dim, backend):
    W = H = raster_dim
    zW = zH = zone_dim
    # Make sure that the raster dim is a multiple of the zones dim
    assert W % zW == 0
    assert H % zH == 0

    # initialize the values raster
    self.values = get_xr_dataarray((H, W), backend)

    # initialize the zones raster
    zones = xr.DataArray(np.zeros((H, W)))
    hstep = H//zH
    wstep = W//zW
    for i in range(zH):
        for j in range(zW):
            zones[i*hstep: (i+1)*hstep, j*wstep: (j+1)*wstep] = i*zW + j
    '''
    zones now looks like this
    >>> zones = np.array([
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
        [2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
        [2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
        [2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
        [2, 2, 2, 2, 2, 3, 3, 3, 3, 3]])
    '''
    self.zones = create_arr(zones, backend=backend)

    # Now setup the custom stat funcs
    if backend == 'cupy':
        import cupy

        l2normKernel = cupy.ReductionKernel(
            in_params='T x',
            out_params='float64 y',
            map_expr='x*x',
            reduce_expr='a+b',
            post_map_expr='y = sqrt(a)',
            identity='0',
            name='l2normKernel'
        )
        self.custom_stats = {
            'double_sum': lambda val: val.sum()*2,
            'l2norm': lambda val: np.sqrt(cupy.sum(val * val)),
            'l2normKernel': lambda val: l2normKernel(val)
        }
    else:
        from xrspatial.utils import ngjit

        @ngjit
        def l2normKernel(arr):
            acc = 0
            for x in arr:
                acc += x * x
            return np.sqrt(acc)

        self.custom_stats = {
            'double_sum': lambda val: val.sum()*2,
            'l2norm': lambda val: np.sqrt(np.sum(val * val)),
            'l2normKernel': lambda val: l2normKernel(val)
        }
import numpy as np
import cupy as cp
import time

"""
REDUCTION KERNEL: folds an array down to a simpler unit (e.g. a scalar).

Identity value: This value is used for the initial value of the reduction.
Mapping expression: It is used for the pre-processing of each element to be
    reduced.
Reduction expression: It is an operator to reduce the multiple mapped values.
    The special variables a and b are used for its operands.
Post mapping expression: It is used to transform the resulting reduced values.
    The special variable a is used as its input. Output should be written to
    the output parameter.
"""

normalize_gpu = cp.ReductionKernel(
    'float64 x',      # input params
    'float64 y',      # output params
    'x * x',          # map
    'a + b',          # reduce
    'y = sqrt(a)',    # post-reduction map
    '0',              # identity value
    'normalize_gpu'   # name
)

x = cp.array([0.0, 6.0, 8.0])
print(normalize_gpu(x))  # 10.0 = sqrt(36 + 64)
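# Hedged follow-up (not in the original): the same kernel reduces along an axis
# as well, matching NumPy's per-row L2 norm.
m = cp.arange(6, dtype=cp.float64).reshape(2, 3)
print(normalize_gpu(m, axis=1))               # per-row L2 norm
print(np.linalg.norm(cp.asnumpy(m), axis=1))  # NumPy reference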
@contextlib.contextmanager
def timer(message):
    cupy.cuda.Stream.null.synchronize()
    start = time.time()
    yield
    cupy.cuda.Stream.null.synchronize()
    end = time.time()
    print('%s: %f sec' % (message, end - start))


var_kernel = cupy.ElementwiseKernel(
    'T x0, T x1, T c0, T c1', 'T out',
    'out = (x0 - c0) * (x0 - c0) + (x1 - c1) * (x1 - c1)',
    'var_kernel')

sum_kernel = cupy.ReductionKernel(
    'T x, S mask', 'T out',
    'mask ? x : 0',
    'a + b', 'out = a', '0', 'sum_kernel')

count_kernel = cupy.ReductionKernel(
    'T mask', 'float32 out',
    'mask ? 1.0 : 0.0',
    'a + b', 'out = a', '0.0', 'count_kernel')


def fit_xp(X, n_clusters, max_iter):
    assert X.ndim == 2

    # Get NumPy or CuPy module from the supplied array.
    xp = cupy.get_array_module(X)

    n_samples = len(X)

    # Make an array to store the labels indicating which cluster each sample
    # is contained.
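# Hedged sketch (not in the original): how the masked kernels above combine
# into a per-cluster mean; `X_demo`, `labels` and `k` are illustrative names.
X_demo = cupy.random.rand(100, 2)
labels = cupy.random.randint(0, 3, 100)
k = 0
mask = labels == k
center = sum_kernel(X_demo, mask[:, None], axis=0) / count_kernel(mask)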
        header[key.format(i)] = basename(f)
    header['NCOMBINE'] = ldata
    hdu = mkhdu(combined, header=header)
    hdul = fits.HDUList([hdu] + exthdus)
    hdul.writeto(name, **kwargs)
    print('Combine: {:d} frames, Output: {}'.format(ldata, basename(name)))
    return combined


check_sum = cp.ReductionKernel(in_params='T x',
                               out_params='uint64 z',
                               map_expr='(x!=0)',
                               reduce_expr='a+b',
                               post_map_expr='z=a',
                               identity='0',
                               name='check_sum')

weightedsum = cp.ReductionKernel(in_params='T x, T f',
                                 out_params='T y',
                                 map_expr='f * (f ? x : 0)',
                                 reduce_expr='a+b',
                                 post_map_expr='y=a',
                                 identity='0',
                                 name='weightedsum')

weightedvar = cp.ReductionKernel(in_params='T x, T m, T f',
                                 out_params='T y',
                                 map_expr='square(x,m,f)',
class KernelList:
    ker_X = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
        """,
        operation="""
            y = x[i^mask];
        """,
        name="X",
    )
    ker_Y = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> img = thrust::complex<double>(0,1);
        """,
        operation="""
            if(i&mask) y = -img*x[i^mask];
            else y = img*x[i^mask];
        """,
        name="Y",
    )
    ker_Z = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
        """,
        operation="""
            if(i&mask) y = -x;
            else y = x;
        """,
        name="Z",
    )
    ker_H = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> sq2 = thrust::complex<double>(1/sqrt(2.),0);
        """,
        operation="""
            if(i&mask) y = (-x[i] + x[i^mask])*sq2;
            else y = (x[i] + x[i^mask])*sq2;
        """,
        name="H",
    )
    ker_S = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> img = thrust::complex<double>(0,1);
        """,
        operation="""
            if(i&mask) y = img*x;
            else y = x;
        """,
        name="S",
    )
    ker_T = cp.ElementwiseKernel(
        in_params="S x, int32 k",
        out_params="S y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> ph = thrust::complex<double>(1/sqrt(2.),1/sqrt(2.));
        """,
        operation="""
            if(i&mask) y = ph*x;
            else y = x;
        """,
        name="T",
    )
    ker_CX = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u",
        out_params="T y",
        loop_prep="""
            int mask_k = 1<<k;
            int mask_u = 1<<u;
        """,
        operation="""
            if(i&mask_k) y = x[i^mask_u];
            else y = x[i];
        """,
        name="CX",
    )
    ker_CZ = cp.ElementwiseKernel(
        in_params="T x, int32 k, int32 u",
        out_params="T y",
        loop_prep="""
            int mask_k = 1<<k;
            int mask_u = 1<<u;
        """,
        operation="""
            if((i&mask_k) && (i&mask_u)) y = -x;
            else y = x;
        """,
        name="CZ",
    )
    ker_Toffoli = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u, int32 t",
        out_params="T y",
        loop_prep="""
            int mask_k = 1<<k;
            int mask_u = 1<<u;
            int mask_t = 1<<t;
        """,
        operation="""
            if((i&mask_k) && (i&mask_u)) y = x[i^mask_t];
            else y = x[i];
        """,
        name="Toffoli",
    )
    ker_Xrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
            thrust::complex<double> s = thrust::complex<double>(0,sin(theta));
        """,
        operation="""
            y = c*x[i]+s*x[i^mask];
        """,
        name="Xrot",
    )
    ker_Yrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
            thrust::complex<double> s = thrust::complex<double>(sin(theta),0);
        """,
        operation="""
            if(i&mask) y = c*x[i]-s*x[i^mask];
            else y = c*x[i]+s*x[i^mask];
        """,
        name="Yrot",
    )
    ker_Zrot = cp.ElementwiseKernel(
        in_params="T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            thrust::complex<double> c1 = thrust::complex<double>(cos(theta),sin(theta));
            thrust::complex<double> c2 = thrust::complex<double>(cos(theta),-sin(theta));
        """,
        operation="""
            if(i&mask) y = c2*x;
            else y = c1*x;
        """,
        name="Zrot",
    )
    ker_XXrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u, float64 theta",
        out_params="T y",
        loop_prep="""
            int mask_ku = (1<<k)+(1<<u);
            thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
            thrust::complex<double> s = thrust::complex<double>(0,sin(theta));
        """,
        operation="""
            y = c*x[i] + s*x[i^mask_ku];
        """,
        name="XXrot",
    )
    ker_MeasZ0 = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
        """,
        operation="""
            if(i&mask) y = 0;
            else y = x;
        """,
        name="MeasZ0",
    )
    ker_MeasZ1 = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
        """,
        operation="""
            if(i&mask) y = x;
            else y = 0;
        """,
        name="MeasZ1",
    )
    ker_U = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 t0, float64 t1, float64 t2",
        out_params="T y",
        loop_prep="""
            int mask = 1<<k;
            double t12p = (t1+t2)/2;
            double t12m = (t1-t2)/2;
            thrust::complex<double> u00 = thrust::complex<double>(cos(t12p),sin(-t12p)) * cos(t0/2);
            thrust::complex<double> u01 = thrust::complex<double>(-cos(t12m),-sin(-t12m)) * sin(t0/2);
            thrust::complex<double> u10 = thrust::complex<double>(cos(t12m),sin(t12m)) * sin(t0/2);
            thrust::complex<double> u11 = thrust::complex<double>(cos(t12p),sin(t12p)) * cos(t0/2);
        """,
        operation="""
            if(i&mask) y = u10*x[i^mask] + u11*x[i];
            else y = u00*x[i] + u01*x[i^mask];
        """,
        name="U",
    )
    ker_trace = cp.ReductionKernel(
        "T x", "T y", "x*thrust::conj(x)", "a+b", "y = a", "0", "trace")

    onePauli = [ker_X, ker_Y, ker_Z]
    oneClifford = onePauli + [ker_H, ker_S]
    oneGate = oneClifford + [ker_T]
    twoPauli = []
    twoClifford = [ker_CX, ker_CZ]
    twoGate = twoPauli + twoClifford + []
    threePauli = []
    threeClifford = []
    threeGate = [ker_Toffoli]
    pauli = onePauli + twoPauli + threePauli
    clifford = oneClifford + twoClifford
    discreteGate = oneGate + twoGate + threeGate
    oneRot = [ker_Xrot, ker_Yrot, ker_Zrot]
    twoRot = [ker_XXrot]
    matchgate = [ker_Zrot, ker_XXrot]
    continuusGate = oneRot + twoRot
    genericGate = [ker_U]
    measurement = [ker_MeasZ0, ker_MeasZ1]
    allGate = discreteGate + continuusGate + measurement + genericGate
    allGateName = [g.name for g in allGate]

    # require target
    oneDiscrete = oneGate
    # require control, target
    twoDiscrete = twoGate
        return xp.broadcast_to(x, shape)
    else:
        # numpy 1.9 doesn't support broadcast_to method
        dummy = xp.empty(shape)
        bx, _ = xp.broadcast_arrays(x, dummy)
        return bx


try:
    import cupy as cp

    inv_norm_comp = cp.ReductionKernel(
        'T x',  # input params
        'T y',  # output params
        'x * x',  # map
        'a + b',  # reduce
        'y = 1.0/sqrt(a + 1e-5)',  # post-reduction map
        '0',  # identity value
        'inv_norm_comp'  # kernel name
    )

    scale_output = cp.ElementwiseKernel(
        'T x, T inv_norm, T gamma, T beta',
        'T normalized, T scaled',
        '''
        normalized = x * inv_norm;
        scaled = normalized * gamma + beta;
        ''',
        'scale_output')

    backprop_scale = cp.ElementwiseKernel(
        'T inv_norm, T gy_centered, T normalized, T sc_prod',
        'T z',
        '''
        z = inv_norm * (gy_centered - normalized * sc_prod);
        ''',
        'backprop_scale')
def calculate_trac(image_mat, out: cp.ndarray, delay: int = 1,
                   order: int = 0) -> cp.ndarray:
    """Temporal radiality auto-cumulant."""
    frame_n, h, w = image_mat.shape[:3]

    _trac2 = cp.ReductionKernel(
        "T X1, T X2, T length",
        "T out",
        "X1 * X2",
        "a + b",
        "out = a / length",
        "0",
        "trac2",
    )
    _trac3 = cp.ReductionKernel(
        "T X1, T X2, T X3, T length",
        "T out",
        "X1 * X2 * X3",
        "a + b",
        "out = a / length",
        "0",
        "trac3",
    )
    _trac4 = cp.ReductionKernel(
        "T X1, T X2, T X3, T X4, T length",
        "T out",
        "X1 * X2 * X3 * X4",
        "a + b",
        "out = a / length",
        "0",
        "trac4",
    )
    _subtract_product = cp.ElementwiseKernel(
        "T A, T B", "T C", "C -= A * B", "subtract_product", no_return=True)

    if 2 * delay > frame_n and order != 0:
        order = 0
        warnings.warn(
            "Total number of frames is too small to do TRAC, using TRA instead"
        )

    deltaRt = image_mat - cp.mean(image_mat, axis=0)
    if order == 0:  # TRA (time average)
        result = cp.mean(image_mat, axis=0)
    elif order == 2:  # TRAC2
        if delay == 0:
            A = B = deltaRt
        else:
            A = deltaRt[:-delay]
            B = deltaRt[delay:]
        result = _trac2(A, B, frame_n, axis=0)
    elif order == 3:  # TRAC3
        if delay == 0:
            A = B = C = deltaRt
        else:
            A = deltaRt[:-2 * delay]
            B = deltaRt[delay:-delay]
            C = deltaRt[2 * delay:]
        result = _trac3(A, B, C, frame_n, axis=0)
    elif order == 4:  # TRAC4
        if delay == 0:
            A = B = C = D = deltaRt
        else:
            A = deltaRt[:-3 * delay]
            B = deltaRt[delay:-2 * delay]
            C = deltaRt[2 * delay:-delay]
            D = deltaRt[3 * delay:]
        result = _trac4(A, B, C, D, frame_n, axis=0)
        # _subtract_product takes the two factors separately; passing their
        # product as a single argument would leave `result` unmodified.
        AB = _trac2(A, B, frame_n, axis=0)
        CD = _trac2(C, D, frame_n, axis=0)
        _subtract_product(AB, CD, result)
        del AB, CD
        AC = _trac2(A, C, frame_n, axis=0)
        BD = _trac2(B, D, frame_n, axis=0)
        _subtract_product(AC, BD, result)
        del AC, BD
        AD = _trac2(A, D, frame_n, axis=0)
        BC = _trac2(B, C, frame_n, axis=0)
        _subtract_product(AD, BC, result)
        del AD, BC
    else:
        raise ValueError("sofi-order can only be 2, 3, 4 or 0!")
    # result -= cp.min(result)
    cp.abs(result, out=out)
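# Hedged usage sketch (not in the original): computing a TRAC2 image from a
# small synthetic stack; the shapes are illustrative.
stack = cp.random.rand(50, 64, 64)  # (frames, height, width)
trac2_img = cp.empty((64, 64))
calculate_trac(stack, trac2_img, delay=1, order=2)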
bit_length = 64
size = 2**17
n_gpu = 8

print('bit', bit_length)
print('size', size)
print('n_gpu', n_gpu)

dtype_from_bit = {
    8: 'uint8',
    16: 'uint16',
    32: 'uint32',
    64: 'uint64',
}

parity_kernel = cupy.ReductionKernel(
    'T x', 'T y', 'x', 'a ^ b', 'y = a', '0', 'parity')


def check(n):
    count = 0
    while n:
        count += 1
        n &= n - 1
    return count


begin_gen = time()
matrix = numpy.random.randint(0, 1 << bit_length,
                              (size, size), dtype_from_bit[bit_length])
end_gen = time()
print('mem', matrix.__sizeof__())
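# Hedged check (not in the original): the kernel reduces with XOR, so it yields
# the bitwise parity word of all elements.
sample = cupy.asarray([0b1010, 0b0110], dtype='uint64')
assert int(parity_kernel(sample)) == (0b1010 ^ 0b0110)  # 0b1100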