Ejemplo n.º 1
0
class Measurement:
    """Measurement operation implemented with one CuPy reduction kernel.

    ``apply`` runs the kernel once per target and records a classical bit in
    ``helper["cregs"]``; ``to_qasm`` emits the matching ``measure`` lines.
    """

    # Map step: squared magnitude of every amplitude whose index `_j` has the
    # target bit clear.  Reduce step: sum.  Post-map step: rescales `q_inout`
    # in place using the reduced value `a` and the scalar `p`.
    # NOTE(review): the post-map expression reads `(a > b)`, where `b` is the
    # output parameter and is never assigned; `(a > p)` was presumably
    # intended, which would mirror the `(a <= p)` term -- confirm on a GPU.
    # NOTE(review): the post-map never writes `b`, so the value returned by
    # the kernel call looks unspecified -- confirm.
    _kernel = cupy.ReductionKernel(
        "uint64 target_mask, float64 p, T q_inout", "float64 b",
        "(!(_j & target_mask)) * (q_inout * q_inout.conj()).real()", "a + b",
        """
        q_inout = (!(_i & target_mask) == (a <= p)) * q_inout / sqrt((a <= p) * a + (a > b) * (1.0 - a))
        """, "0.0")

    # Flag read by code outside this block; its consumer is not visible here.
    no_cache = True

    def __init__(self):
        pass

    def apply(self, helper, qubits, targets):
        """Run the measurement kernel for every target and return ``qubits``.

        Reads ``helper["n_qubits"]`` and ``helper["indices"]`` and writes one
        integer per target into ``helper["cregs"]``.
        """
        n_qubits = helper["n_qubits"]
        i = helper["indices"]  # looked up but not used below
        # A single random threshold is drawn and reused for all targets.
        p = random.random()
        for target in slicing(targets, n_qubits):
            target_mask = 1 << target
            a = self._kernel(target_mask, p, qubits)
            helper["cregs"][target] = int(a > p)
        return qubits

    def to_qasm(self, helper, targets):
        """Return one ``measure q[t] -> c[t];`` line per target, as a list."""
        n_qubits = helper["n_qubits"]
        qasm = []
        for target in slicing(targets, n_qubits):
            qasm.append("measure q[{}] -> c[{}];".format(target, target))
        return qasm
Ejemplo n.º 2
0
    def forward_gpu(self, inputs):
        """Normalize ``x`` along axis 1 on the GPU, then scale and shift it.

        Falls back to ``forward_cpu`` when ``self.gpu_optim`` is false.

        Args:
            inputs: Tuple ``(x, gamma, beta)``; ``x`` must be 2-D.

        Returns:
            A 1-tuple holding the scaled output.  ``self.inv_norm`` and
            ``self.normalized`` are stored as side effects.
        """
        if not self.gpu_optim:
            return self.forward_cpu(inputs)
        xp = cuda.get_array_module(*inputs)
        x, gamma, beta = inputs
        a = x - xp.mean(x, axis=1, keepdims=True)
        assert len(a.shape) == 2
        H = a.shape[1]

        # Computes 1 / sqrt(sum(a * a, axis=1) / H + eps) in one reduction.
        # The constants are embedded with %r rather than %f: %f keeps only six
        # decimals, so a small epsilon such as 1e-7 would be written into the
        # CUDA source as 0.000000 and silently dropped.  repr() of a Python
        # float round-trips exactly and is a valid C floating literal.
        post_map = 'y = 1.0/sqrt(a/%r + %r)' % (float(H), float(self.eps))

        inv_norm = cp.ReductionKernel(
            'T x',  # input params
            'T y',  # output params
            'x * x',  # map
            'a + b',  # reduce
            post_map,  # post-reduction map
            '0',  # identity value
            'inv_norm_comp'  # kernel name
        )(a, axis=1, keepdims=True)

        self.inv_norm = inv_norm

        normalized, scaled = scale_output(a, inv_norm, gamma, beta)
        self.normalized = normalized
        return scaled,
Ejemplo n.º 3
0
 def test_optimize_reduction_kernel(self):
     """A sum kernel gives the same result inside and outside ``optimize()``."""
     kernel = cupy.ReductionKernel(
         in_params='T x', out_params='T out', map_expr='x',
         reduce_expr='a + b', post_map_expr='out = a', identity='0',
         name='my_sum')
     data = testing.shaped_arange((3, 4), cupy)
     baseline = kernel(data, axis=1)
     with cupyx.optimizing.optimize():
         tuned = kernel(data, axis=1)
     testing.assert_array_equal(baseline, tuned)
Ejemplo n.º 4
0
    def test_optimize_cache(self):
        """Check when a kernel call triggers a new optimization run.

        The optimizer implementation is replaced by a mock and its call count
        is asserted after every kernel invocation, so the order of the calls
        below is significant.
        """
        if (_accelerator.ACCELERATOR_CUB
                in _accelerator.get_reduction_accelerators()):
            pytest.skip('optimize cannot be mocked for CUB reduction')

        target = cupyx.optimizing._optimize._optimize
        target_full_name = '{}.{}'.format(target.__module__, target.__name__)

        with mock.patch(target_full_name) as optimize_impl:
            my_sum = cupy.ReductionKernel('T x', 'T out', 'x', 'a + b',
                                          'out = a', '0', 'my_sum')
            my_sum_ = cupy.ReductionKernel('T x', 'T out', 'x', 'a + b',
                                           'out = a', '0', 'my_sum_')
            x = testing.shaped_arange((3, 4), cupy)
            x_ = testing.shaped_arange((3, 4), cupy)
            y = testing.shaped_arange((4, 4), cupy)
            z = testing.shaped_arange((3, 4), cupy)[::-1]
            # y shares strides with x but not its shape; z shares the shape
            # with x but is a reversed view.
            assert x.strides == y.strides
            assert x.shape == z.shape

            with cupyx.optimizing.optimize():
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 1
                # Repeating the identical call does not optimize again.
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 1
                # A different reduction axis does.
                my_sum(x, axis=0)
                assert optimize_impl.call_count == 2
                # A distinct array built the same way as x does not.
                my_sum(x_, axis=1)
                assert optimize_impl.call_count == 2
                # A different shape does.
                my_sum(y, axis=1)
                assert optimize_impl.call_count == 3
                # The reversed view does.
                my_sum(z, axis=1)
                assert optimize_impl.call_count == 4
                # A kernel created with another name does.
                my_sum_(x, axis=1)
                assert optimize_impl.call_count == 5

            # A context with a different key optimizes the same call again.
            with cupyx.optimizing.optimize(key='new_key'):
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 6

            # With key=None the earlier my_sum(x, axis=1) result is reused,
            # while the call without an axis triggers one more run.
            with cupyx.optimizing.optimize(key=None):
                my_sum(x, axis=1)
                assert optimize_impl.call_count == 6
                my_sum(x)
                assert optimize_impl.call_count == 7
Ejemplo n.º 5
0
 def test_invalid_kernel_name(self):
     """Creating a kernel named ``'1'`` must raise ``ValueError``."""
     kernel_args = ('T x', 'T y', 'x', 'a + b', 'y = a', '0')
     with self.assertRaisesRegex(ValueError, 'Invalid kernel name'):
         cupy.ReductionKernel(*kernel_args, name='1')
Ejemplo n.º 6
0
 def __get_count_votes_cupy_kernel(invert):
     """Build a reduction kernel that counts comparisons against the centre.

     Each element ``x`` is compared with ``_raw_x[_in_ind.size()/2]`` using
     ``>`` when ``invert`` is truthy and ``<`` otherwise; the comparison
     results are summed with ``reduce_type='int'``.
     """
     if invert:
         comparison = '>'
     else:
         comparison = '<'
     map_expr = '_raw_x[_in_ind.size()/2]' + comparison + 'x'
     return cupy.ReductionKernel(
         'X x', 'Y y', map_expr, 'a + b', 'y = a', '0', 'lt',
         reduce_type='int')
Ejemplo n.º 7
0
 def cupy_threshold_local_mean(*args, **kwargs):
     """Return a reduction kernel that averages its input.

     The positional and keyword arguments are accepted but ignored.  The
     kernel body follows https://github.com/cupy/cupy/issues/3909 and divides
     the sum by ``_in_ind.size()``, an undocumented variable.
     """
     return cupy.ReductionKernel(
         in_params='T x',
         out_params='T y',
         map_expr='x',
         reduce_expr='a + b',
         post_map_expr='y = a / _in_ind.size()',
         identity='0',
         name='mean',
     )
Ejemplo n.º 8
0
def reduce(in_params, out_params, map_expr, reduce_expr, post_map_expr,
           identity, name, **kwargs):
    """Create a global reduction kernel function.

    Verifies that CUDA is available and then forwards every argument to
    :class:`cupy.ReductionKernel`.  The arguments are the same as for that
    class, except that ``name`` is mandatory here.

    NOTE(review): the original description says the result is cached per
    argument combination and device through :func:`~chainer.cuda.memoize`;
    no such decorator is applied within this block -- confirm it exists at
    the definition site.
    """
    check_cuda_available()
    kernel_args = (in_params, out_params, map_expr, reduce_expr,
                   post_map_expr, identity, name)
    return cupy.ReductionKernel(*kernel_args, **kwargs)
Ejemplo n.º 9
0
    def test_optimize_cache_multi_gpus(self):
        """The same kernel call is optimized once per device (0, then 1)."""
        impl = cupyx.optimizing._optimize._optimize
        patch_path = '.'.join((impl.__module__, impl.__name__))

        with mock.patch(patch_path) as optimize_impl:
            my_sum = cupy.ReductionKernel(
                'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')

            with cupyx.optimizing.optimize():
                # The expected call count grows by one for each device used.
                for expected_calls, device_id in enumerate((0, 1), start=1):
                    with cupy.cuda.Device(device_id):
                        data = testing.shaped_arange((3, 4), cupy)
                        my_sum(data, axis=1)
                        assert optimize_impl.call_count == expected_calls
Ejemplo n.º 10
0
        def update_core_gpu(self, param):
            """Apply one Hamiltonian update step to ``param`` on the GPU.

            The momentum ``p`` is advanced from the gradient, then
            ``param.data`` is moved by ``epsilon * p / denomp`` where
            ``denomp = sqrt(1 + sum(p * p))``.  Does nothing when
            ``param.grad`` is ``None``.

            The three CUDA kernels are created lazily and cached on
            ``HamiltonianExplicitRule``.
            """
            grad = param.grad
            if grad is None:
                return
            hp = self.hyperparam
            p = self.state['p']
            rule = HamiltonianExplicitRule

            if rule._kernel_x is None:
                rule._kernel_x = cp.ElementwiseKernel(
                    'T epsilon, T p, T denomp, T param', 'T x',
                    'x = param + epsilon * p / denomp', 'Hamiltonian_x')

            if rule._kernel_r is None:
                # The identity '1' contributes the "+ 1" under the root.
                rule._kernel_r = cp.ReductionKernel(
                    'T p', 'T denomp', 'p * p', 'a + b', 'denomp = sqrt(a)',
                    '1', 'relativistic')

            if rule._kernel_p is None:
                if hp.approx == 'first':
                    momentum_op = 'p1 = p0 * delta - epsilon * delta * grad'
                else:
                    momentum_op = (
                        'p1 = p0 * (2.0 - (1.0 / delta)) - epsilon * grad')
                # NOTE(review): the cached kernel is shared by both `approx`
                # modes, so whichever mode runs first fixes the formula for
                # the whole process -- confirm this is intended.
                rule._kernel_p = cp.ElementwiseKernel(
                    'T delta, T epsilon, T grad, T p0', 'T p1',
                    momentum_op, 'Hamiltonian_p')

            # The momentum is always updated.  Previously, in the non-'first'
            # mode, the call that compiled the kernel skipped this step
            # because the update sat in an `else` of the creation check.
            p = rule._kernel_p(hp.delta, hp.epsilon, grad, p)
            # NOTE(review): the new `p` is not written back to
            # self.state['p'] here -- confirm whether that is handled
            # elsewhere.

            denomp = rule._kernel_r(p)
            param.data = rule._kernel_x(hp.epsilon, p, denomp, param.data)
Ejemplo n.º 11
0
    def test_optimize_cache_multi_gpus(self):
        """The same kernel call is optimized once per device (0, then 1).

        Skipped when the CUB reduction accelerator is enabled.
        """
        reduction_accelerators = _accelerator.get_reduction_accelerators()
        if _accelerator.ACCELERATOR_CUB in reduction_accelerators:
            pytest.skip('optimize cannot be mocked for CUB reduction')

        impl = cupyx.optimizing._optimize._optimize
        patch_path = '.'.join((impl.__module__, impl.__name__))

        with mock.patch(patch_path) as optimize_impl:
            my_sum = cupy.ReductionKernel(
                'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')

            with cupyx.optimizing.optimize():
                # The expected call count grows by one for each device used.
                for expected_calls, device_id in enumerate((0, 1), start=1):
                    with cupy.cuda.Device(device_id):
                        data = testing.shaped_arange((3, 4), cupy)
                        my_sum(data, axis=1)
                        assert optimize_impl.call_count == expected_calls
Ejemplo n.º 12
0
    def test_optimize_pickle(self):
        """Saved optimization parameters can be reloaded under the same key.

        Parameters are saved from one context, the context cache is cleared,
        and a fresh context must match the saved keys only after ``load``.
        Loading into a context created with another key must raise
        ``ValueError``.
        """
        my_sum = cupy.ReductionKernel(
            'T x', 'T out', 'x', 'a + b', 'out = a', '0', 'my_sum')
        x = testing.shaped_arange((3, 4), cupy)

        with tempfile.TemporaryDirectory() as directory:
            filepath = directory + '/optimize_params'

            with cupyx.optimizing.optimize() as context:
                my_sum(x, axis=1)
                params_map = context._params_map
                context.save(filepath)

            # Drop the cached contexts so the next optimize() starts fresh.
            cupy.core._optimize_config._clear_all_contexts_cache()

            with cupyx.optimizing.optimize() as context:
                assert params_map.keys() != context._params_map.keys()
                context.load(filepath)
                assert params_map.keys() == context._params_map.keys()

            with cupyx.optimizing.optimize(key='other_key') as context:
                with pytest.raises(ValueError):
                    context.load(filepath)
Ejemplo n.º 13
0
        vert_norm[a*3+1] += yy;
        vert_norm[a*3+2] += zz;
        vert_norm[b*3] += xx;
        vert_norm[b*3+1] += yy;
        vert_norm[b*3+2] += zz;
        vert_norm[c*3] += xx;
        vert_norm[c*3+1] += yy;
        vert_norm[c*3+2] += zz;
        float area = sqrt(xx*xx + yy*yy + zz*zz) / 3;
        vertex_weight[a] += area;
        vertex_weight[b] += area;
        vertex_weight[c] += area;
}
''', 'calc_vertex_norm')

# Reduction kernel returning sum(x * w) over its two inputs.
calc_avg_vertex = cp.ReductionKernel(
    in_params='T x, T w',
    out_params='T y',
    map_expr='x * w',
    reduce_expr='a + b',
    post_map_expr='y = a',
    identity='0',
    name='calc_avg_vertex',
)


class Obj3D(object):
    def __init__(self, filename, lines=None):
        if lines is None:
            file = open(filename, 'r')
            lines = file.readlines()
        iter = 0

        if lines[0].rstrip() == 'OFF':
            self.numVertices, self.numFaces, self.numEdges = (
                int(x) for x in str.split(lines[1]))
            iter = 2
        elif lines[0][0:3] == 'OFF':
            self.numVertices, self.numFaces, self.numEdges = (
Ejemplo n.º 14
0
 def get_sum_func(self):
     """Return a freshly built ``my_sum`` reduction kernel (plain sum)."""
     sum_kernel = cupy.ReductionKernel(
         in_params='T x', out_params='T out', map_expr='x',
         reduce_expr='a + b', post_map_expr='out = a', identity='0',
         name='my_sum')
     return sum_kernel
import cupy
import numpy

# Plain sum reduction.  The CUDA kernel is named "avg", but the division by
# the element count is done by the caller, not by the kernel.
add_kernel = cupy.ReductionKernel(
    "T x",  # input
    "T m",  # output
    "x",  # map: identity
    "a + b",  # reduce: sum
    "m = a",  # post-map: emit the sum unchanged
    "0",  # identity value
    "avg",  # kernel name
)

def avg(x):
    """Return the sum of ``x`` from ``add_kernel`` divided by ``x.size``."""
    total = add_kernel(x)
    return total / x.size

# Demo: average 0..9 with the custom kernel and print it next to numpy.mean.
x = cupy.arange(10, dtype=numpy.float64)

print(avg(x), numpy.mean(x))
Ejemplo n.º 16
0
import cupy as cp

# Double precision

# Elementwise residuals: (holo - data) / noise.
curesiduals = cp.ElementwiseKernel('float64 holo, float64 data, float64 noise',
                                   'float64 residuals',
                                   'residuals = (holo - data) / noise',
                                   'curesiduals')

# Sum of squared residuals.
cuchisqr = cp.ReductionKernel(
    'float64 holo, float64 data, float64 noise', 'float64 chisqr',
    '((holo - data) / noise) * ((holo - data) / noise)', 'a + b', 'chisqr = a',
    '0', 'cuchisqr')

# Sum of absolute residuals.
cuabsolute = cp.ReductionKernel('float64 holo, float64 data, float64 noise',
                                'float64 s', 'abs((holo - data) / noise)',
                                'a + b', 's = a', '0', 'cuabsolute')

# Single precision

# Elementwise residuals in float32: (holo - data) / noise.
# The kernel name carries the same "f" suffix as the Python variable, so the
# float32 and float64 kernels can be told apart in profiler output and
# compiler messages (both previously used the float64 kernel's name).
curesidualsf = cp.ElementwiseKernel(
    'float32 holo, float32 data, float32 noise', 'float32 residuals',
    'residuals = (holo - data) / noise', 'curesidualsf')

# Sum of squared residuals in float32.
cuchisqrf = cp.ReductionKernel(
    'float32 holo, float32 data, float32 noise', 'float32 chisqr',
    '((holo - data) / noise) * ((holo - data) / noise)', 'a + b', 'chisqr = a',
    '0', 'cuchisqrf')

cuabsolutef = cp.ReductionKernel('float32 holo, float32 data, float32 noise',
                                 'float32 s', 'abs((holo - data) / noise)',
Ejemplo n.º 17
0
class TestFilterComplexFast(FilterTestCaseBase):
    """Runs the shared ``_filter`` check inherited from the base class.

    Results from the NumPy/SciPy and CuPy back ends are compared with
    tolerances of 1e-5; the test requires scipy>=1.6.0.
    """

    @testing.numpy_cupy_allclose(atol=1e-5, rtol=1e-5, scipy_name='scp')
    @testing.with_requires('scipy>=1.6.0')
    def test_filter(self, xp, scp):
        return self._filter(xp, scp)


# Kernels and Functions for testing generic_filter
# Raw-kernel variant: writes the mean of the squares of the first
# `filter_size` values of x into y[0].  Despite the name "rms", no square
# root is taken.
rms_raw = cupy.RawKernel(
    '''extern "C" __global__
void rms(const double* x, int filter_size, double* y) {
    double ss = 0;
    for (int i = 0; i < filter_size; ++i) { ss += x[i]*x[i]; }
    y[0] = ss/filter_size;
}''', 'rms')
# Reduction-kernel variant of the same quantity: sum of squares divided by
# `_in_ind.size()`.
rms_red = cupy.ReductionKernel('X x', 'Y y', 'x*x', 'a + b',
                               'y = a/_in_ind.size()', '0', 'rms')


def rms_pyfunc(x):
    """Return the mean of the squared elements of ``x`` (no square root)."""
    squares = x * x
    return squares.sum() / len(x)


# Raw kernel: counts how many of the first `filter_size` values of x are
# strictly smaller than the value at index filter_size/2 and writes that
# count to y[0].
lt_raw = cupy.RawKernel(
    '''extern "C" __global__
void lt(const double* x, int filter_size, double* y) {
    int n = 0;
    double c = x[filter_size/2];
    for (int i = 0; i < filter_size; ++i) { n += c>x[i]; }
    y[0] = n;
}''', 'lt')
lt_red = cupy.ReductionKernel('X x',
Ejemplo n.º 18
0
 def setUp(self):
     """Create the ``my_sum`` reduction kernel (plain sum) for the tests."""
     self.my_sum = cupy.ReductionKernel(
         in_params='T x', out_params='T out', map_expr='x',
         reduce_expr='a + b', post_map_expr='out = a', identity='0',
         name='my_sum')
Ejemplo n.º 19
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import cupy as cp
import numpy as np

if __name__ == '__main__':
    x = cp.arange(10, dtype=np.float32).reshape(2, 5)
    # L2 norm over all elements, written as a custom reduction kernel.
    L2norm_kernel = cp.ReductionKernel(
        'T x',  # in_params: input
        'T y',  # out_params: output
        ' x * x',  # map_expr: pre-processing
        'a + b',  # reduce_expr: reduction
        'y = sqrt(a)',  # post_map_expr: post-processing
        '0',  # identity: initial value
        'l2norm')  # name: kernel name
    y = L2norm_kernel(x)
    print(x)
    print(y)

##from p.13 of https://www.slideshare.net/ryokuta/cupy
#import chainer
#import cupy as cp
##import numpy as np
#
#l2norm_kernel = cp.ReductionKernel(
#  'T x', #input
#  'T y', #output
#  'x * x', #preprocess
#  'a + b',  #reduce ?
#  'y=sqrt(a)', #post-process
Ejemplo n.º 20
0
    w_flat = w.reshape(-1, w.shape[2])
    if w_flat_sq is None:
        w_flat_sq = xp.power(w_flat, 2).sum(axis=1, keepdims=True)

    x_sq = xp.power(x, 2).sum(axis=1, keepdims=True)

    num = xp.dot(x, w_flat.T)
    denum = xp.sqrt(x_sq * w_flat_sq.T)
    similarity = xp.nan_to_num(num / denum)

    return 1 - similarity


# The kernel is only defined when CuPy is available.  It reduces |x - w| by
# summation.
if _cupy_available:
    _manhattan_distance_kernel = cp.ReductionKernel('T x, T w', 'T y',
                                                    'abs(x-w)', 'a+b', 'y = a',
                                                    '0', 'l1norm')


def manhattan_distance(x, w, xp=default_xp):
    """Calculate Manhattan distance

    It is very slow (~10x) compared to euclidean distance
    TODO: improve performance. Maybe a custom kernel is necessary

    NB: result shape is (N,X*Y)
    """

    if xp.__name__ == 'cupy':
        d = _manhattan_distance_kernel(x[:, xp.newaxis, xp.newaxis, :],
                                       w[xp.newaxis, :, :, :],
Ejemplo n.º 21
0
import math
import cupy as cp
import numpy as np

# L2 norm as a custom reduction: sqrt(sum(x * x)).
l2norm_kernel = cp.ReductionKernel(
    'T x',  # input params
    'T y',  # output params
    'x * x',  # map
    'a + b',  # reduce
    'y = sqrt(a)',  # post-reduction map
    '0',  # identity value
    'l2norm'  # kernel name
)
x = cp.arange(5, dtype=np.float32).reshape(1, 5)

print(x)
# Kernel result along axis 1, followed by the same value computed by hand.
print(l2norm_kernel(x, axis=1))
print(math.sqrt(0 * 0 + 1 * 1 + 2 * 2 + 3 * 3 + 4 * 4))
Ejemplo n.º 22
0
import tabulate
import numpy as np
import cupy as cp

from timer import stopwatch

# define ReductionKernel
# define ReductionKernel
# Sums v * v and multiplies the total by 0.5 * m in the post-map step.
# NOTE(review): `m` is declared as a regular input but is only read in the
# post-map expression; presumably it is meant to be a scalar -- confirm.
rd_kinetic_kernel = cp.ReductionKernel(
    'T m, T v', # input
    'T y', # output
    'v * v', # pre-process
    'a + b', # reduction
    'y = 0.5 * m * a', # post-process
    '0', # initial
    'rd_kinetic') # name

@stopwatch
def kinetic_kernel(m, v):
    """Evaluate ``rd_kinetic_kernel`` on ``m`` and ``v`` (timed)."""
    energy = rd_kinetic_kernel(m, v)
    return energy

@stopwatch
def kinetic(m, v):
    """Return ``0.5 * m * sum(v * v)`` using ``np.sum`` (timed)."""
    sum_of_squares = np.sum(v * v)
    return 0.5 * m * sum_of_squares

# test ReductionKernel
N = [1,10,1000000,10000000,100000000]
times_cpu = []
times_gpu = []
times_kernel = []
for n in N:
    v = np.sin(np.linspace(-np.pi, np.pi, n)).astype(np.float32) / n
Ejemplo n.º 23
0
    def setup(self, raster_dim, zone_dim, backend):
        """Create the values raster, the zones raster and the custom stats.

        A square ``raster_dim`` x ``raster_dim`` raster is split into
        ``zone_dim`` x ``zone_dim`` equally sized rectangular zones numbered
        row by row from 0.  ``raster_dim`` must be a multiple of ``zone_dim``.
        The results are stored in ``self.values``, ``self.zones`` and
        ``self.custom_stats``.
        """
        height = width = raster_dim
        zones_down = zones_across = zone_dim
        # The raster must split evenly into zones along both axes.
        assert width % zones_across == 0
        assert height % zones_down == 0

        self.values = get_xr_dataarray((height, width), backend)

        # Fill each rectangular tile with its zone id.  For a 10x10 raster
        # with 2x2 zones the result is:
        #     rows 0-4: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
        #     rows 5-9: [2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
        zone_grid = xr.DataArray(np.zeros((height, width)))
        rows_per_zone = height // zones_down
        cols_per_zone = width // zones_across
        for zone_row in range(zones_down):
            row_span = slice(zone_row * rows_per_zone,
                             (zone_row + 1) * rows_per_zone)
            for zone_col in range(zones_across):
                col_span = slice(zone_col * cols_per_zone,
                                 (zone_col + 1) * cols_per_zone)
                zone_grid[row_span, col_span] = (
                    zone_row * zones_across + zone_col)

        self.zones = create_arr(zone_grid, backend=backend)

        # Pick the backend-specific pieces, then build the stats table once.
        if backend == 'cupy':
            import cupy
            l2normKernel = cupy.ReductionKernel(
                'T x', 'float64 y', 'x*x', 'a+b', 'y = sqrt(a)', '0',
                'l2normKernel')
            array_sum = cupy.sum
        else:
            from xrspatial.utils import ngjit

            @ngjit
            def l2normKernel(arr):
                acc = 0
                for x in arr:
                    acc += x * x
                return np.sqrt(acc)

            array_sum = np.sum

        self.custom_stats = {
            'double_sum': lambda val: val.sum()*2,
            'l2norm': lambda val: np.sqrt(array_sum(val * val)),
            'l2normKernel': lambda val: l2normKernel(val),
        }
Ejemplo n.º 24
0
import numpy as np 
import cupy as cp 
import time

"""
REDUCTION KERNEL
when processing the kernel into a simpler unit
Identity value: This value is used for the initial value of reduction.
Mapping expression: It is used for the pre-processing of each element to be reduced.
Reduction expression: It is an operator to reduce the multiple mapped values. The special variables a and b are used for its operands.
Post mapping expression: It is used to transform the resulting reduced values. The special variable a is used as its input. Output should be written to the output parameter.
"""

# Despite its name, this kernel returns the L2 norm sqrt(sum(x * x)) of its
# input; it does not rescale the input.
normalize_gpu = cp.ReductionKernel(
    'float64 x', #input params
    'float64 y', #output params
    'x * x', # map
    'a + b', # reduce
    'y = sqrt(a)', # post reduction map
    '0', # identity value
    'normalize_gpu' # name
)


# Demo: the norm of (0, 6, 8) is 10.
x = cp.array([0.0,6.0,8.0])
print(normalize_gpu(x))
Ejemplo n.º 25
0

@contextlib.contextmanager
def timer(message):
    """Print the wall-clock time spent inside the ``with`` body.

    The null CUDA stream is synchronized before the clock starts and again
    before it stops.  Nothing is printed if the body raises.
    """
    null_stream = cupy.cuda.Stream.null
    null_stream.synchronize()
    began = time.time()
    yield
    null_stream.synchronize()
    elapsed = time.time() - began
    print('%s:  %f sec' % (message, elapsed))


# Elementwise squared distance between the point (x0, x1) and (c0, c1).
var_kernel = cupy.ElementwiseKernel(
    'T x0, T x1, T c0, T c1', 'T out',
    'out = (x0 - c0) * (x0 - c0) + (x1 - c1) * (x1 - c1)', 'var_kernel')
# Sum of x over the elements where mask is truthy.
sum_kernel = cupy.ReductionKernel('T x, S mask', 'T out', 'mask ? x : 0',
                                  'a + b', 'out = a', '0', 'sum_kernel')
# Number of truthy mask elements, returned as float32.
count_kernel = cupy.ReductionKernel('T mask', 'float32 out',
                                    'mask ? 1.0 : 0.0', 'a + b', 'out = a',
                                    '0.0', 'count_kernel')


def fit_xp(X, n_clusters, max_iter):
    assert X.ndim == 2

    # Get NumPy or CuPy module from the supplied array.
    xp = cupy.get_array_module(X)

    n_samples = len(X)

    # Make an array to store the labels indicating which cluster each sample is
    # contained.
Ejemplo n.º 26
0
                header[key.format(i)] = basename(f)
        header['NCOMBINE'] = ldata

        hdu = mkhdu(combined, header=header)

        hdul = fits.HDUList([hdu] + exthdus)
        hdul.writeto(name, **kwargs)
        print('Combine: {:d} frames, Output: {}'.format(ldata, basename(name)))

    return combined


# Counts the non-zero elements of x (result is uint64).
check_sum = cp.ReductionKernel(in_params='T x',
                               out_params='uint64 z',
                               map_expr='(x!=0)',
                               reduce_expr='a+b',
                               post_map_expr='z=a',
                               identity='0',
                               name='check_sum')

# Sum of f * x; elements where f is zero contribute 0 without reading x's
# value into the product.
weightedsum = cp.ReductionKernel(in_params='T x, T f',
                                 out_params='T y',
                                 map_expr='f * (f ? x : 0)',
                                 reduce_expr='a+b',
                                 post_map_expr='y=a',
                                 identity='0',
                                 name='weightedsum')

weightedvar = cp.ReductionKernel(in_params='T x, T m, T f',
                                 out_params='T y',
                                 map_expr='square(x,m,f)',
Ejemplo n.º 27
0
class KernelList:
    """Collection of CuPy kernels acting on an array indexed by bit strings.

    Every elementwise kernel derives a bit mask ``1 << k`` from its integer
    argument(s) and computes output element ``i`` from input elements ``i``
    and/or ``i ^ mask``.  The class attributes after the kernels group them
    into lists.  The kernel names suggest quantum gates; that reading is not
    verified here.
    """

    # y[i] = x[i ^ mask]: swap each pair of elements differing in bit k.
    ker_X = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    """,
        operation="""
    y = x[i^mask];
    """,
        name="X",
    )

    # Swap as in ker_X, multiplied by -i where bit k of i is set and by +i
    # where it is clear.
    # NOTE(review): this is the opposite sign from the usual Pauli-Y matrix
    # convention -- confirm it is intended.
    ker_Y = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> img = thrust::complex<double>(0,1);
    """,
        operation="""
    if(i&mask) y = -img*x[i^mask];
    else y = img*x[i^mask];
    """,
        name="Y",
    )

    # Negate the elements whose index has bit k set.
    ker_Z = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    """,
        operation="""
    if(i&mask) y = -x;
    else y = x;
    """,
        name="Z",
    )

    # (x[i] + x[i^mask]) / sqrt(2) where bit k is clear,
    # (x[i^mask] - x[i]) / sqrt(2) where it is set.
    ker_H = cp.ElementwiseKernel(
        in_params="raw T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> sq2 = thrust::complex<double>(1/sqrt(2.),0);
    """,
        operation="""
    if(i&mask) y = (-x[i] + x[i^mask])*sq2;
    else y = (x[i] + x[i^mask])*sq2;
    """,
        name="H",
    )

    # Multiply by i the elements whose index has bit k set.
    ker_S = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> img = thrust::complex<double>(0,1);
    """,
        operation="""
    if(i&mask) y = img*x;
    else y = x;
    """,
        name="S",
    )

    # Multiply by (1 + i) / sqrt(2) the elements whose index has bit k set.
    ker_T = cp.ElementwiseKernel(
        in_params="S x, int32 k",
        out_params="S y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> ph = thrust::complex<double>(1/sqrt(2.),1/sqrt(2.));
    """,
        operation="""
    if(i&mask) y = ph*x;
    else y = x;
    """,
        name="T",
    )

    # Where bit k is set, take the element with bit u flipped.
    ker_CX = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u",
        out_params="T y",
        loop_prep="""
    int mask_k = 1<<k;
    int mask_u = 1<<u;
    """,
        operation="""
    if(i&mask_k) y = x[i^mask_u];
    else y = x[i];
    """,
        name="CX",
    )

    # Negate the elements whose index has both bit k and bit u set.
    ker_CZ = cp.ElementwiseKernel(
        in_params="T x, int32 k, int32 u",
        out_params="T y",
        loop_prep="""
    int mask_k = 1<<k;
    int mask_u = 1<<u;
    """,
        operation="""
    if((i&mask_k) && (i&mask_u)) y = -x;
    else y = x;
    """,
        name="CZ",
    )

    # Where bits k and u are both set, take the element with bit t flipped.
    ker_Toffoli = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u, int32 t",
        out_params="T y",
        loop_prep="""
    int mask_k = 1<<k;
    int mask_u = 1<<u;
    int mask_t = 1<<t;
    """,
        operation="""
    if((i&mask_k) && (i&mask_u)) y = x[i^mask_t];
    else y = x[i];
    """,
        name="Toffoli",
    )

    # y = cos(theta) * x[i] + i * sin(theta) * x[i^mask].
    ker_Xrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
    thrust::complex<double> s = thrust::complex<double>(0,sin(theta));
    """,
        operation="""
    y = c*x[i]+s*x[i^mask];
    """,
        name="Xrot",
    )

    # y = cos(theta) * x[i] +/- sin(theta) * x[i^mask]; minus where bit k of
    # i is set.
    ker_Yrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
    thrust::complex<double> s = thrust::complex<double>(sin(theta),0);
    """,
        operation="""
    if(i&mask) y = c*x[i]-s*x[i^mask];
    else y = c*x[i]+s*x[i^mask];
    """,
        name="Yrot",
    )

    # Multiply by exp(-i*theta) where bit k is set, exp(+i*theta) otherwise.
    ker_Zrot = cp.ElementwiseKernel(
        in_params="T x, int32 k, float64 theta",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    thrust::complex<double> c1 = thrust::complex<double>(cos(theta),sin(theta));
    thrust::complex<double> c2 = thrust::complex<double>(cos(theta),-sin(theta));
    """,
        operation="""
    if(i&mask) y = c2*x;
    else y = c1*x;
    """,
        name="Zrot",
    )

    # As ker_Xrot, but the partner index flips both bit k and bit u.
    ker_XXrot = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, int32 u,float64 theta",
        out_params="T y",
        loop_prep="""
    int mask_ku = (1<<k)+(1<<u);
    thrust::complex<double> c = thrust::complex<double>(cos(theta),0);
    thrust::complex<double> s = thrust::complex<double>(0,sin(theta));
    """,
        operation="""
    y = c*x[i] + s*x[i^mask_ku];
    """,
        name="XXrot",
    )

    # Zero the elements whose index has bit k set (no renormalization).
    ker_MeasZ0 = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    """,
        operation="""
    if(i&mask) y = 0;
    else y = x;
    """,
        name="MeasZ0",
    )

    # Zero the elements whose index has bit k clear (no renormalization).
    ker_MeasZ1 = cp.ElementwiseKernel(
        in_params="T x, int32 k",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    """,
        operation="""
    if(i&mask) y = x;
    else y = 0;
    """,
        name="MeasZ1",
    )

    # General 2x2 transform on the pair (i, i^mask) with coefficients
    # u00..u11 built from the three angles t0, t1, t2.
    ker_U = cp.ElementwiseKernel(
        in_params="raw T x, int32 k, float64 t0, float64 t1, float64 t2",
        out_params="T y",
        loop_prep="""
    int mask = 1<<k;
    double t12p = (t1+t2)/2;
    double t12m = (t1-t2)/2;
    thrust::complex<double> u00 = thrust::complex<double>(cos(t12p),sin(-t12p)) * cos(t0/2);
    thrust::complex<double> u01 = thrust::complex<double>(-cos(t12m),-sin(-t12m)) * sin(t0/2);
    thrust::complex<double> u10 = thrust::complex<double>(cos(t12m),sin(t12m)) * sin(t0/2);
    thrust::complex<double> u11 = thrust::complex<double>(cos(t12p),sin(t12p)) * cos(t0/2);
    """,
        operation="""
    if(i&mask) y = u10*x[i^mask] + u11*x[i];
    else y = u00*x[i] + u01*x[i^mask];
    """,
        name="U",
    )

    # Sum of x * conj(x) over all elements.
    ker_trace = cp.ReductionKernel("T x", "T y", "x*thrust::conj(x)", "a+b",
                                   "y = a", "0", "trace")

    # Groupings of the kernels above; each list is built from earlier ones.
    onePauli = [ker_X, ker_Y, ker_Z]
    oneClifford = onePauli + [ker_H, ker_S]
    oneGate = oneClifford + [ker_T]

    twoPauli = []
    twoClifford = [ker_CX, ker_CZ]
    twoGate = twoPauli + twoClifford + []

    threePauli = []
    threeClifford = []
    threeGate = [ker_Toffoli]

    pauli = onePauli + twoPauli + threePauli
    clifford = oneClifford + twoClifford
    discreteGate = oneGate + twoGate + threeGate

    oneRot = [ker_Xrot, ker_Yrot, ker_Zrot]
    twoRot = [ker_XXrot]
    matchgate = [ker_Zrot, ker_XXrot]
    continuusGate = oneRot + twoRot

    genericGate = [ker_U]

    measurement = [ker_MeasZ0, ker_MeasZ1]

    allGate = discreteGate + continuusGate + measurement + genericGate
    allGateName = [g.name for g in allGate]

    # require target
    oneDiscrete = oneGate
    # require control, target
    twoDiscrete = twoGate
Ejemplo n.º 28
0
        return xp.broadcast_to(x, shape)
    else:
        # numpy 1.9 doesn't support broadcast_to method
        dummy = xp.empty(shape)
        bx, _ = xp.broadcast_arrays(x, dummy)
        return bx


try:
    import cupy as cp

    inv_norm_comp = cp.ReductionKernel(
        'T x',  # input params
        'T y',  # output params
        'x * x',  # map
        'a + b',  # reduce
        'y = 1.0/sqrt(a + 1e-5)',  # post-reduction map
        '0',  # identity value
        'inv_norm_comp'  # kernel name
    )

    scale_output = cp.ElementwiseKernel(
        'T x, T inv_norm, T gamma, T beta', 'T normalized, T scaled', '''
              normalized = x * inv_norm;
              scaled = normalized * gamma + beta;
         ''', 'scale_output')

    backprop_scale = cp.ElementwiseKernel(
        'T inv_norm, T gy_centered, T normalized, T sc_prod', 'T z', '''
              z = inv_norm *(gy_centered - normalized * sc_prod);
         ''', 'backprop_scale')
Ejemplo n.º 29
0
def calculate_trac(image_mat,
                   out: cp.array,
                   delay: int = 1,
                   order: int = 0) -> cp.array:
    """Temporal radiality auto-cumulant.

    Args:
        image_mat: Array whose first axis is the frame axis; it must have at
            least three dimensions.
        out: Array that receives the absolute value of the result.
        delay: Frame offset between the shifted copies of the signal.
        order: ``0`` for the plain time average, or ``2``, ``3``, ``4`` for
            the cumulant of that order.  Falls back to ``0`` with a warning
            when ``2 * delay`` exceeds the number of frames.

    Returns:
        ``out``, after the absolute value of the result was written to it.

    Raises:
        ValueError: If ``order`` is not one of 0, 2, 3 or 4.
    """

    frame_n, _, _ = image_mat.shape[:3]

    _trac2 = cp.ReductionKernel(
        "T X1, T X2, T length",
        "T out",
        "X1 * X2",
        "a + b",
        "out = a / length",
        "0",
        "trac2",
    )
    _trac3 = cp.ReductionKernel(
        "T X1, T X2, T X3, T length",
        "T out",
        "X1 * X2 * X3",
        "a + b",
        "out = a / length",
        "0",
        "trac3",
    )
    _trac4 = cp.ReductionKernel(
        "T X1, T X2, T X3, T X4, T length",
        "T out",
        "X1 * X2 * X3 * X4",
        "a + b",
        "out = a / length",
        "0",
        "trac4",
    )
    # In-place update C -= A * B; C must be passed as the third argument.
    _subtract_product = cp.ElementwiseKernel("T A, T B",
                                             "T C",
                                             "C -= A * B",
                                             "subctract_product",
                                             no_return=True)
    if 2 * delay > frame_n and order != 0:
        order = 0
        warnings.warn(
            "Total number of frames is too small to do TRAC, using TRA instead"
        )
    deltaRt = image_mat - cp.mean(image_mat, axis=0)

    if order == 0:
        # TRA(time average)
        result = cp.mean(image_mat, axis=0)
    elif order == 2:
        # TRAC2
        if delay == 0:
            A = B = deltaRt
        else:
            A = deltaRt[:-delay]
            B = deltaRt[delay:]
        result = _trac2(A, B, frame_n, axis=0)
    elif order == 3:
        # TRAC3
        if delay == 0:
            A = B = C = deltaRt
        else:
            A = deltaRt[:-2 * delay]
            B = deltaRt[delay:-delay]
            C = deltaRt[2 * delay:]
        result = _trac3(A, B, C, frame_n, axis=0)
    elif order == 4:
        # TRAC4: fourth moment minus the three pairings of second moments.
        if delay == 0:
            A = B = C = D = deltaRt
        else:
            A = deltaRt[:-3 * delay]
            B = deltaRt[delay:-2 * delay]
            C = deltaRt[2 * delay:-delay]
            D = deltaRt[3 * delay:]
        result = _trac4(A, B, C, D, frame_n, axis=0)
        # Both factors and the output are passed explicitly.  Passing only
        # two arguments would bind them to the inputs A and B, so the kernel
        # would write into a newly allocated array and leave `result`
        # unchanged.
        AB = _trac2(A, B, frame_n, axis=0)
        CD = _trac2(C, D, frame_n, axis=0)
        _subtract_product(AB, CD, result)
        del AB, CD
        AC = _trac2(A, C, frame_n, axis=0)
        BD = _trac2(B, D, frame_n, axis=0)
        _subtract_product(AC, BD, result)
        del AC, BD
        AD = _trac2(A, D, frame_n, axis=0)
        BC = _trac2(B, C, frame_n, axis=0)
        _subtract_product(AD, BC, result)
        del AD, BC
    else:
        raise ValueError("sofi-order can only be 2, 3, 4 or 0!")
    # result -= cp.min(result)
    cp.abs(result, out=out)
    return out
# Benchmark configuration: integer width in bits, matrix side length and
# GPU count.
bit_length = 64
size = 2**17
n_gpu = 8

print('bit', bit_length)
print('size', size)
print('n_gpu', n_gpu)

# Unsigned dtype name for each supported bit width.
dtype_from_bit = {
    8: 'uint8',
    16: 'uint16',
    32: 'uint32',
    64: 'uint64',
}

# XOR-reduction of all input elements.
parity_kernel = cupy.ReductionKernel('T x', 'T y', 'x', 'a ^ b', 'y = a', '0',
                                     'parity')


def check(n):
    """Return the number of set bits in the non-negative integer ``n``."""
    set_bits = 0
    while n != 0:
        # n & (n - 1) clears the lowest set bit.
        n = n & (n - 1)
        set_bits += 1
    return set_bits


# Generate a size x size matrix of random integers in [0, 2**bit_length) on
# the host, recording the time before and after, then print its size in
# bytes.
begin_gen = time()
matrix = numpy.random.randint(0, 1 << bit_length, (size, size),
                              dtype_from_bit[bit_length])
end_gen = time()
print('mem', matrix.__sizeof__())