Esempio n. 1
0
def uniform_integer(bijection, dtype, low, high=None):
    """
    Creates a sampler of uniformly distributed integers on top of ``bijection``.

    With both bounds given, the requested interval is ``[low, high)``.
    If ``high`` is ``None``, the bounds are replaced by ``low = 0`` and ``high = low + 1``.
    Supported dtypes: any numpy integers.
    The sampler is marked as deterministic when the size of the interval divides
    the number of distinct raw values (``2**32`` or ``2**64``), that is, when it is
    a power of 2; according to the original documentation, a fixed number of counters
    is used in each thread in that case.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    if high is None:
        # NOTE(review): earlier documentation described this case as ``[0, low)``,
        # but ``high`` becomes ``low + 1`` here, so the original ``low`` is not excluded
        # from the interval -- confirm which one is intended.
        low, high = 0, low + 1
    else:
        # An explicitly given interval has to contain at least two values.
        assert low < high - 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Range checks of the bounds against the bit width of the target dtype.
    bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        assert low >= -2**(bits - 1)
        assert high < 2**(bits - 1)
    else:
        assert low >= 0
        assert high < 2**bits

    # Intervals of up to 2**32 values are served by 32-bit raw numbers,
    # larger ones need 64-bit raw numbers.
    num = high - low
    raw_dtype = numpy.dtype('uint32' if num <= 2**32 else 'uint64')
    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2**(raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    render_kwds = dict(
        bijection=bijection,
        dtype=dtype,
        ctype=ctype,
        raw_ctype=raw_ctype,
        raw_func=raw_func,
        max_num=max_num,
        num=num,
        low=low)
    module = Module(template_def, render_kwds=render_kwds)

    return Sampler(bijection, module, dtype, deterministic=(max_num % num == 0))
Esempio n. 2
0
def test_offsets_in_kernel(thr):
    """
    Verifies that a kernel is handed the base data of an offset array,
    so the offsets have to be applied inside the kernel by hand.
    """

    n_items = 100
    offsets = dict(dest_offset=4, src_offset=2)
    dtype = dtypes.normalize_type(numpy.int32)

    # The offsets (in elements) are rendered into the kernel source.
    kernel_src = """
        KERNEL void test(GLOBAL_MEM int *dest, GLOBAL_MEM int *src)
        {
            const SIZE_T i = get_global_id(0);
            dest[i + ${dest_offset}] = src[i + ${src_offset}];
        }
        """
    kernel = thr.compile(kernel_src, render_kwds=offsets).test

    # Destination: a view into a bigger buffer, shifted by the offset given in bytes.
    dest_base_dev = thr.array(n_items + offsets['dest_offset'], dtype)
    dest_view_dev = thr.array(
        n_items,
        dtype,
        offset=offsets['dest_offset'] * dtype.itemsize,
        base=dest_base_dev)

    # Source: the same construction, with known contents.
    src_host = numpy.arange(n_items + offsets['src_offset']).astype(dtype)
    src_base_dev = thr.to_device(src_host)
    src_view_dev = thr.array(
        n_items,
        dtype,
        offset=offsets['src_offset'] * dtype.itemsize,
        base=src_base_dev)

    kernel(dest_view_dev, src_view_dev, global_size=n_items)
    expected = src_host[offsets['src_offset']:]

    assert diff_is_negligible(dest_view_dev.get(), expected)
Esempio n. 3
0
def check_performance(thr_and_double, shape_and_axes):
    """
    Times ``FFTShift`` on the device and validates its output
    against ``numpy.fft.fftshift``.

    Returns a tuple of the best (smallest) wall-clock time over 10 runs
    and the size of the processed array in bytes.
    """

    thr, use_double = thr_and_double

    if use_double:
        dtype = dtypes.normalize_type(numpy.complex128)
    else:
        dtype = dtypes.normalize_type(numpy.complex64)

    shape, axes = shape_and_axes

    host_data = numpy.arange(product(shape)).reshape(shape).astype(dtype)

    compiled_shift = FFTShift(host_data, axes=axes).compile(thr)

    input_dev = thr.to_device(host_data)
    output_dev = thr.empty_like(host_data)

    timings = []
    for _ in range(10):
        started = time.time()
        compiled_shift(output_dev, input_dev)
        # Wait for the device to finish so that the whole run is measured.
        thr.synchronize()
        timings.append(time.time() - started)

    expected = numpy.fft.fftshift(host_data, axes=axes)
    assert diff_is_negligible(output_dev.get(), expected)

    return min(timings), product(shape) * dtype.itemsize
Esempio n. 4
0
def check_performance(thr_and_double, shape_and_axes):
    """
    Benchmarks the compiled ``FFTShift`` computation and checks the result
    against the ``numpy.fft.fftshift`` reference.

    Returns ``(best_time, nbytes)``: the minimum wall-clock time of 10 attempts
    and the number of bytes in the shifted array.
    """

    thr, use_double = thr_and_double

    if use_double:
        dtype = dtypes.normalize_type(numpy.complex128)
    else:
        dtype = dtypes.normalize_type(numpy.complex64)

    shape, axes = shape_and_axes

    host_data = numpy.arange(product(shape)).reshape(shape).astype(dtype)

    compiled_shift = FFTShift(host_data, axes=axes).compile(thr)

    input_dev = thr.to_device(host_data)
    output_dev = thr.empty_like(host_data)

    timings = []
    for _ in range(10):
        started = time.time()
        compiled_shift(output_dev, input_dev)
        # Synchronize before stopping the clock.
        thr.synchronize()
        timings.append(time.time() - started)

    expected = numpy.fft.fftshift(host_data, axes=axes)
    assert diff_is_negligible(output_dev.get(), expected)

    return min(timings), product(shape) * dtype.itemsize
Esempio n. 5
0
 def __init__(self, bijection, module, dtype, randoms_per_call=1, deterministic=False):
     """__init__()""" # hide the signature from Sphinx
     # NOTE: the docstring above is intentionally minimal (see the inline comment);
     # extending it would change what the generated documentation shows.

     # Stored as given; presumably the number of random values produced
     # by one call of the sampler -- confirm against the module template.
     self.randoms_per_call = randoms_per_call
     # The dtype is kept in its normalized form.
     self.dtype = dtypes.normalize_type(dtype)
     # The remaining arguments are stored without any processing.
     self.deterministic = deterministic
     self.bijection = bijection
     self.module = module
Esempio n. 6
0
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """
    Creates a randomly filled numpy array of the given shape and dtype.

    - Struct dtypes (``dtype.names`` is set) are filled field by field
      through recursive calls with the same ``no_zeros`` and ``high``.
    - Integer dtypes use ``numpy.random.randint`` with the lower bound 1
      (if ``no_zeros``) or 0, and the upper bound ``high`` (100 by default).
    - All other dtypes use ``numpy.random.uniform`` with the lower bound 0.01
      (if ``no_zeros``) or 0, and the upper bound ``high`` (1.0 by default);
      for complex dtypes the real and imaginary parts are drawn separately.

    If ``strides`` is given, the result is re-viewed with ``as_strided``
    using these strides.
    """
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        result = numpy.empty(shape, dtype)
        for field in dtype.names:
            result[field] = get_test_array(
                shape, dtype[field], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            lower = 1 if no_zeros else 0
            upper = 100 if high is None else high  # will work even with signed chars
            draw = numpy.random.randint
        else:
            lower = 0.01 if no_zeros else 0
            upper = 1.0 if high is None else high
            draw = numpy.random.uniform

        def sample():
            return draw(lower, upper, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            # Real part is drawn first, imaginary part second.
            real_part = sample()
            result = real_part + 1j * sample()
        else:
            result = sample()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
Esempio n. 7
0
    def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
        """
        Sets up the array metadata: shape, size, normalized dtype, C type,
        strides, buffer size, offset and the cast function.

        Omitted ``strides`` and ``nbytes`` fall back to the values returned by
        ``helpers.default_strides`` and ``helpers.min_buffer_size``;
        ``_default_strides`` and ``_default_nbytes`` record whether the effective
        values are equal to these fallbacks.
        """
        if shape is None:
            self.shape = tuple()
        else:
            self.shape = wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)

        fallback_strides = helpers.default_strides(self.shape, self.dtype.itemsize)
        effective_strides = fallback_strides if strides is None else tuple(strides)
        self._default_strides = effective_strides == fallback_strides
        self.strides = effective_strides

        # The fallback buffer size depends on the strides chosen above.
        fallback_nbytes = helpers.min_buffer_size(
            self.shape, self.dtype.itemsize, self.strides)
        effective_nbytes = fallback_nbytes if nbytes is None else nbytes
        self._default_nbytes = effective_nbytes == fallback_nbytes
        self.nbytes = effective_nbytes

        self.offset = offset
        self._cast = dtypes.cast(self.dtype)
Esempio n. 8
0
def test_summation(thr):
    """
    Performance test: sums ``2**22`` ``int64`` values with ``Reduce``
    and checks the result against ``numpy``'s ``sum()``.

    Returns the minimum wall-clock time of 10 attempts and the input size in bytes.
    """

    n_elements = 2**22
    dtype = dtypes.normalize_type(numpy.int64)

    host_input = get_test_array(n_elements, dtype)
    input_dev = thr.to_device(host_input)

    reduction = Reduce(host_input, predicate_sum(dtype))

    output_dev = thr.empty_like(reduction.parameter.output)
    expected = numpy.array([host_input.sum()], dtype)

    compiled_reduction = reduction.compile(thr)

    timings = []
    for _ in range(10):
        started = time.time()
        compiled_reduction(output_dev, input_dev)
        # Wait for the device before stopping the clock.
        thr.synchronize()
        timings.append(time.time() - started)

    assert diff_is_negligible(output_dev.get(), expected)

    return min(timings), n_elements * dtype.itemsize
Esempio n. 9
0
    def array(
            self, shape, dtype, strides=None, offset=0, nbytes=None,
            allocator=None, base=None, base_data=None):
        """
        Creates an ``Array`` object attached to this object.

        - ``allocator`` defaults to ``self.allocate``.
        - ``nbytes`` defaults to the value of ``min_buffer_size`` for the given
          shape, item size, strides and offset.
        - If ``base`` is given, its ``data`` attribute is used as the base data
          (overriding ``base_data``).
        - Otherwise, if no ``base_data`` was passed and a non-zero offset or
          custom strides are requested, the buffer is allocated here.
        """
        if allocator is None:
            allocator = self.allocate

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            nbytes = min_buffer_size(
                shape, dtype.itemsize, strides=strides, offset=offset)

        custom_layout = offset != 0 or strides is not None
        if base is not None:
            base_data = base.data
        elif base_data is None and custom_layout:
            base_data = allocator(nbytes)

        array_kwds = dict(
            strides=strides,
            offset=offset,
            allocator=allocator,
            base_data=base_data,
            nbytes=nbytes)
        return Array(self, shape, dtype, **array_kwds)
Esempio n. 10
0
def test_summation(thr):
    """
    Benchmarks a ``Reduce`` summation of ``2**22`` ``int64`` values
    and verifies it against the sum computed by ``numpy``.

    Returns ``(best_time, nbytes)``: the minimum wall-clock time of 10 attempts
    and the size of the input in bytes.
    """

    n_elements = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    host_input = get_test_array(n_elements, dtype)
    input_dev = thr.to_device(host_input)

    reduction = Reduce(host_input, predicate_sum(dtype))

    output_dev = thr.empty_like(reduction.parameter.output)
    expected = numpy.array([host_input.sum()], dtype)

    compiled_reduction = reduction.compile(thr)

    timings = []
    for _ in range(10):
        started = time.time()
        compiled_reduction(output_dev, input_dev)
        # Synchronize so that the measured interval covers the whole run.
        thr.synchronize()
        timings.append(time.time() - started)

    assert diff_is_negligible(output_dev.get(), expected)

    return min(timings), n_elements * dtype.itemsize
Esempio n. 11
0
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """
    Returns a numpy array of the given shape and dtype filled with random values.

    - For struct dtypes (``dtype.names`` is set) every field is generated
      by a recursive call with the same ``no_zeros`` and ``high``.
    - For integer dtypes the values come from ``numpy.random.randint``;
      the lower bound is 1 if ``no_zeros`` is set and 0 otherwise,
      the upper bound is ``high`` (100 if omitted).
    - For all other dtypes the values come from ``numpy.random.uniform``;
      the lower bound is 0.01 if ``no_zeros`` is set and 0 otherwise,
      the upper bound is ``high`` (1.0 if omitted). Complex dtypes get
      separately drawn real and imaginary parts.

    If ``strides`` is given, the result is passed through ``as_strided``
    with its own shape and these strides.
    """
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        result = numpy.empty(shape, dtype)
        for field in dtype.names:
            result[field] = get_test_array(
                shape, dtype[field], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            lower = 1 if no_zeros else 0
            upper = 100 if high is None else high  # will work even with signed chars
            draw = numpy.random.randint
        else:
            lower = 0.01 if no_zeros else 0
            upper = 1.0 if high is None else high
            draw = numpy.random.uniform

        def sample():
            return draw(lower, upper, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            # The real part is generated before the imaginary one.
            real_part = sample()
            result = real_part + 1j * sample()
        else:
            result = sample()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
Esempio n. 12
0
 def padded(cls, dtype, shape, pad=0):
     """
     Alternative constructor: returns a :py:class:`Type` object for an array
     padded from all dimensions by ``pad`` elements.

     The strides, offset and buffer size are taken from
     ``helpers.padded_buffer_parameters``.
     """
     dtype = dtypes.normalize_type(dtype)
     buffer_params = helpers.padded_buffer_parameters(shape, dtype.itemsize, pad=pad)
     strides, offset, nbytes = buffer_params
     return cls(dtype, shape, strides=strides, offset=offset, nbytes=nbytes)
Esempio n. 13
0
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Scan performance on large problem sizes (``int64`` data, all axes).

    Returns the time reported by ``check_scan`` and the size of the data in bytes.
    """
    dtype = dtypes.normalize_type(numpy.int64)
    scan_kwds = dict(dtype=dtype, axes=None, exclusive=exclusive, measure_time=True)
    best_time = check_scan(thr, large_perf_shape, **scan_kwds)
    nbytes = helpers.product(large_perf_shape) * dtype.itemsize
    return best_time, nbytes
Esempio n. 14
0
def uniform_integer(bijection, dtype, low, high=None):
    """
    Builds a :py:class:`~reikna.cbrng.samplers.Sampler` object producing
    uniformly distributed integers with the help of ``bijection``.

    With both bounds given, the requested interval is ``[low, high)``.
    If ``high`` is ``None``, the bounds are replaced by ``low = 0`` and ``high = low + 1``.
    Supported dtypes: any numpy integers.
    The sampler is marked as deterministic when the size of the interval divides
    the number of distinct raw values (``2**32`` or ``2**64``), that is, when it is
    a power of 2; according to the original documentation, a fixed number of counters
    is used in each thread in that case.
    """
    if high is None:
        # NOTE(review): earlier documentation described this case as ``[0, low)``,
        # but ``high`` becomes ``low + 1`` here, so the original ``low`` is not excluded
        # from the interval -- confirm which one is intended.
        low, high = 0, low + 1
    else:
        # An explicitly given interval has to contain at least two values.
        assert low < high - 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    # Range checks of the bounds against the bit width of the target dtype.
    bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        assert low >= -2 ** (bits - 1)
        assert high < 2 ** (bits - 1)
    else:
        assert low >= 0
        assert high < 2 ** bits

    # 32-bit raw numbers suffice for intervals of up to 2**32 values;
    # anything bigger is sampled from 64-bit raw numbers.
    num = high - low
    raw_dtype = numpy.dtype('uint32' if num <= 2 ** 32 else 'uint64')
    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2 ** (raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    render_kwds = dict(
        bijection=bijection,
        dtype=dtype,
        ctype=ctype,
        raw_ctype=raw_ctype,
        raw_func=raw_func,
        max_num=max_num,
        num=num,
        low=low)
    module = Module(template_def, render_kwds=render_kwds)

    return Sampler(bijection, module, dtype, deterministic=(max_num % num == 0))
Esempio n. 15
0
 def padded(cls, dtype, shape, pad=0):
     """
     Creates a :py:class:`Type` object describing an array padded from all
     dimensions by ``pad`` elements.

     The layout (strides, offset, buffer size) is obtained from
     ``helpers.padded_buffer_parameters``.
     """
     dtype = dtypes.normalize_type(dtype)
     buffer_params = helpers.padded_buffer_parameters(shape, dtype.itemsize, pad=pad)
     strides, offset, nbytes = buffer_params
     return cls(dtype, shape, strides=strides, offset=offset, nbytes=nbytes)
Esempio n. 16
0
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Scan performance on small problem sizes in big batches
    (``complex128`` data of shape ``(500, 2, 2, 512)``, scanned over the last axis).

    Returns the time reported by ``check_scan`` and the size of the data in bytes.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)
    scan_kwds = dict(
        dtype=dtype,
        axes=(-1,),
        exclusive=exclusive,
        measure_time=True,
        seq_size=seq_size)
    best_time = check_scan(thr, shape, **scan_kwds)
    nbytes = helpers.product(shape) * dtype.itemsize
    return best_time, nbytes
Esempio n. 17
0
 def __init__(self, dtype, shape=None, strides=None):
     """
     Stores the shape, the total size, the normalized dtype, the C type,
     the strides and the cast function.

     If ``strides`` is omitted, the stride of each axis is computed as the item
     size multiplied by the product of the lengths of all the following axes;
     otherwise ``strides`` is stored as given.
     """
     if shape is None:
         self.shape = tuple()
     else:
         self.shape = wrap_in_tuple(shape)
     self.size = product(self.shape)
     self.dtype = dtypes.normalize_type(dtype)
     self.ctype = dtypes.ctype_module(self.dtype)
     if strides is not None:
         self.strides = strides
     else:
         axes = range(len(self.shape))
         self.strides = tuple(
             self.dtype.itemsize * product(self.shape[axis+1:]) for axis in axes)
     self._cast = dtypes.cast(self.dtype)
Esempio n. 18
0
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Measures the scan performance for large problem sizes
    (``int64`` data, ``axes=None``).

    Returns the time reported by ``check_scan`` and the size of the data in bytes.
    """
    dtype = dtypes.normalize_type(numpy.int64)
    scan_kwds = dict(dtype=dtype, axes=None, exclusive=exclusive, measure_time=True)
    best_time = check_scan(thr, large_perf_shape, **scan_kwds)
    nbytes = helpers.product(large_perf_shape) * dtype.itemsize
    return best_time, nbytes
Esempio n. 19
0
 def __init__(self,
              bijection,
              module,
              dtype,
              randoms_per_call=1,
              deterministic=False):
     """__init__()"""  # hide the signature from Sphinx
     # NOTE: the docstring above is intentionally minimal (see the inline comment);
     # extending it would change what the generated documentation shows.

     # Stored as given; presumably the number of random values produced
     # by one call of the sampler -- confirm against the module template.
     self.randoms_per_call = randoms_per_call
     # The dtype is kept in its normalized form.
     self.dtype = dtypes.normalize_type(dtype)
     # The remaining arguments are stored without any processing.
     self.deterministic = deterministic
     self.bijection = bijection
     self.module = module
Esempio n. 20
0
def philox(bitness, counter_words, rounds=10):
    """
    Creates the Philox bijection: a CBRNG built from a low number
    of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Selected by bitness.
    w_constants_by_bitness = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B),  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85),  # sqrt(3)-1
        ],
    }

    # Selected by the (bitness, counter_words) pair.
    m_constants_by_variant = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [
            numpy.uint64(0xD2E7470EE14C6C93),
            numpy.uint64(0xCA5A826395121157),
        ],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [
            numpy.uint32(0xD2511F53),
            numpy.uint32(0xCD9E8D57),
        ],
    }

    assert 1 <= rounds <= 12

    # Any bitness other than 32 selects 64-bit words.
    if bitness == 32:
        word_dtype = dtypes.normalize_type(numpy.uint32)
    else:
        word_dtype = dtypes.normalize_type(numpy.uint64)

    # The key has half as many words as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("philox")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        w_constants=w_constants_by_bitness[bitness],
        m_constants=m_constants_by_variant[(bitness, counter_words)])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
Esempio n. 21
0
 def __init__(self, dtype, shape=None, strides=None):
     """
     Records the shape, the total size, the normalized dtype, the C type,
     the strides and the cast function.

     When ``strides`` is not given, each axis gets the stride equal to the item
     size times the product of the lengths of the axes that follow it;
     a given ``strides`` value is stored unchanged.
     """
     if shape is None:
         self.shape = tuple()
     else:
         self.shape = wrap_in_tuple(shape)
     self.size = product(self.shape)
     self.dtype = dtypes.normalize_type(dtype)
     self.ctype = dtypes.ctype_module(self.dtype)
     if strides is not None:
         self.strides = strides
     else:
         axes = range(len(self.shape))
         self.strides = tuple(
             self.dtype.itemsize * product(self.shape[axis + 1:]) for axis in axes)
     self._cast = dtypes.cast(self.dtype)
Esempio n. 22
0
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Measures the scan performance for small problem sizes in big batches:
    ``complex128`` data of shape ``(500, 2, 2, 512)`` scanned over the last axis.

    Returns the time reported by ``check_scan`` and the size of the data in bytes.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)
    scan_kwds = dict(
        dtype=dtype,
        axes=(-1,),
        exclusive=exclusive,
        measure_time=True,
        seq_size=seq_size)
    best_time = check_scan(thr, shape, **scan_kwds)
    nbytes = helpers.product(shape) * dtype.itemsize
    return best_time, nbytes
Esempio n. 23
0
def single_kernel_bootstrap_supported(nufhe_params, device_params, raise_exception=False):
    """
    Checks whether single-kernel bootstrap can be used with the given
    scheme parameters on the given device.

    The checks are performed in this order: the API is not OpenCL;
    ``mask_size`` is 1 and ``decomp_length`` is 2; the device's maximum work group
    size fits all the threads; the device's local memory fits the required buffers.

    Returns ``True`` if all the checks pass. If a check fails, returns ``False``,
    or raises ``ValueError`` describing the reason if ``raise_exception`` is set.
    """

    def unsupported(reason):
        # Common failure path: either an exception or a ``False`` result.
        if raise_exception:
            raise ValueError(reason)
        return False

    if device_params.api_id == ocl_id():
        # OpenCL uses some local memory for kernel arguments if there are many of them,
        # and we need all the available local memory for internal buffers.
        return unsupported("Single-kernel bootstrap is not supported for OpenCL")

    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params

    reqs = get_transform(tlwe_params.transform_type).transform_module_requirements()

    mask_size = tlwe_params.mask_size
    decomp_length = tgsw_params.decomp_length

    if mask_size != 1 or decomp_length != 2:
        return unsupported(
            "Single-kernel bootstrap is only supported for mask_size=1 and decomp_length=2")

    # All the transforms have to fit in a single work group.
    skb_transforms = (mask_size + 1) * decomp_length
    total_threads = reqs['threads_per_transform'] * skb_transforms
    if total_threads > device_params.max_work_group_size:
        return unsupported(
            "The chosen device does not support a block/workgroup size big enough "
            "to run single-kernel bootstrap")

    # Local memory estimate, computed from the transform requirements
    # and the item size of the polynomial dtype.
    tr_size = reqs['transform_length'] * reqs['elem_dtype_itemsize']
    temp_size = reqs['temp_length'] * reqs['temp_dtype_itemsize']
    poly_dtype_itemsize = dtypes.normalize_type(Torus32).itemsize
    sh_size = max(tr_size, temp_size)
    buffers_size = sh_size * (skb_transforms + mask_size)
    polynomials_size = (mask_size + 1) * reqs['polynomial_length'] * poly_dtype_itemsize
    required_lmem_size = buffers_size + polynomials_size
    if required_lmem_size > device_params.local_mem_size:
        return unsupported(
            "The chosen device does not have enough shared/local memory "
            "to run single-kernel bootstrap")

    return True
Esempio n. 24
0
def philox(bitness, counter_words, rounds=10):
    """
    Builds the Philox CBRNG, which relies on a low number
    of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``12``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Selected by bitness.
    w_constants_by_bitness = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B),  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85),  # sqrt(3)-1
        ],
    }

    # Selected by the (bitness, counter_words) pair.
    m_constants_by_variant = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [
            numpy.uint64(0xD2E7470EE14C6C93),
            numpy.uint64(0xCA5A826395121157),
        ],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [
            numpy.uint32(0xD2511F53),
            numpy.uint32(0xCD9E8D57),
        ],
    }

    assert 1 <= rounds <= 12

    # Any bitness other than 32 selects 64-bit words.
    if bitness == 32:
        word_dtype = dtypes.normalize_type(numpy.uint32)
    else:
        word_dtype = dtypes.normalize_type(numpy.uint64)

    # The key has half as many words as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("philox")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        w_constants=w_constants_by_bitness[bitness],
        m_constants=m_constants_by_variant[(bitness, counter_words)])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
Esempio n. 25
0
    def array(
            self, shape, dtype, strides=None, offset=0, nbytes=None,
            allocator=None, base=None, base_data=None):
        """
        Creates an ``Array`` object attached to this object.

        - ``allocator`` defaults to ``cuda.mem_alloc``.
        - ``nbytes`` defaults to the value of ``min_buffer_size`` for the given
          shape, item size, strides and offset, converted to ``int``.
        - If ``base`` is given, the base data is taken from its ``base_data``
          attribute when it is an ``Array``, and from ``gpudata`` otherwise
          (overriding the ``base_data`` argument).
        - Otherwise, if no ``base_data`` was passed and a non-zero offset or
          custom strides are requested, the buffer is allocated here.
        """
        # In PyCUDA, the default allocator is not None, but a default alloc object
        if allocator is None:
            allocator = cuda.mem_alloc

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            min_size = min_buffer_size(
                shape, dtype.itemsize, strides=strides, offset=offset)
            nbytes = int(min_size)

        custom_layout = offset != 0 or strides is not None
        if base is not None:
            base_data = base.base_data if isinstance(base, Array) else base.gpudata
        elif base_data is None and custom_layout:
            base_data = allocator(nbytes)

        array_kwds = dict(
            strides=strides,
            allocator=allocator,
            offset=offset,
            base_data=base_data,
            nbytes=nbytes)
        return Array(self, shape, dtype, **array_kwds)
Esempio n. 26
0
    def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
        """
        Records the array metadata: shape, size, normalized dtype, C type,
        strides, buffer size, offset and the cast function.

        When ``strides`` or ``nbytes`` are omitted, the values returned by
        ``helpers.default_strides`` and ``helpers.min_buffer_size`` are used;
        ``_default_strides`` and ``_default_nbytes`` tell whether the effective
        values are equal to these fallbacks.
        """
        if shape is None:
            self.shape = tuple()
        else:
            self.shape = wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)

        fallback_strides = helpers.default_strides(self.shape, self.dtype.itemsize)
        effective_strides = fallback_strides if strides is None else tuple(strides)
        self._default_strides = effective_strides == fallback_strides
        self.strides = effective_strides

        # The fallback buffer size depends on the strides chosen above.
        fallback_nbytes = helpers.min_buffer_size(
            self.shape, self.dtype.itemsize, self.strides)
        effective_nbytes = fallback_nbytes if nbytes is None else nbytes
        self._default_nbytes = effective_nbytes == fallback_nbytes
        self.nbytes = effective_nbytes

        self.offset = offset
        self._cast = dtypes.cast(self.dtype)
Esempio n. 27
0
def test_offsets_in_kernel(thr):
    """
    Makes sure that kernels get the base data of arrays created with an offset,
    which means the offsets must be added manually in the kernel code.
    """

    n_items = 100
    offsets = dict(dest_offset=4, src_offset=2)
    dtype = dtypes.normalize_type(numpy.int32)

    # The offsets (in elements) are rendered into the kernel source.
    kernel_src = """
        KERNEL void test(GLOBAL_MEM int *dest, GLOBAL_MEM int *src)
        {
            const SIZE_T i = get_global_id(0);
            dest[i + ${dest_offset}] = src[i + ${src_offset}];
        }
        """
    kernel = thr.compile(kernel_src, render_kwds=offsets).test

    # Destination: a view into a bigger buffer, shifted by the offset given in bytes.
    dest_base_dev = thr.array(n_items + offsets['dest_offset'], dtype)
    dest_view_dev = thr.array(
        n_items,
        dtype,
        offset=offsets['dest_offset'] * dtype.itemsize,
        base=dest_base_dev)

    # Source: the same construction, with known contents.
    src_host = numpy.arange(n_items + offsets['src_offset']).astype(dtype)
    src_base_dev = thr.to_device(src_host)
    src_view_dev = thr.array(
        n_items,
        dtype,
        offset=offsets['src_offset'] * dtype.itemsize,
        base=src_base_dev)

    kernel(dest_view_dev, src_view_dev, global_size=n_items)
    expected = src_host[offsets['src_offset']:]

    assert diff_is_negligible(dest_view_dev.get(), expected)
Esempio n. 28
0
    def array(self,
              shape,
              dtype,
              strides=None,
              offset=0,
              nbytes=None,
              allocator=None,
              base=None,
              base_data=None):
        """
        Creates an ``Array`` object attached to this object.

        - ``allocator`` defaults to ``cuda.mem_alloc``.
        - ``nbytes`` defaults to the value of ``min_buffer_size`` for the given
          shape, item size, strides and offset, converted to ``int``.
        - If ``base`` is given, the base data is taken from its ``base_data``
          attribute when it is an ``Array``, and from ``gpudata`` otherwise
          (overriding the ``base_data`` argument).
        - Otherwise, if no ``base_data`` was passed and a non-zero offset or
          custom strides are requested, the buffer is allocated here.
        """
        # In PyCUDA, the default allocator is not None, but a default alloc object
        if allocator is None:
            allocator = cuda.mem_alloc

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            min_size = min_buffer_size(
                shape, dtype.itemsize, strides=strides, offset=offset)
            nbytes = int(min_size)

        custom_layout = offset != 0 or strides is not None
        if base is not None:
            base_data = base.base_data if isinstance(base, Array) else base.gpudata
        elif base_data is None and custom_layout:
            base_data = allocator(nbytes)

        array_kwds = dict(
            strides=strides,
            allocator=allocator,
            offset=offset,
            base_data=base_data,
            nbytes=nbytes)
        return Array(self, shape, dtype, **array_kwds)
Esempio n. 29
0
def threefry(bitness, counter_words, rounds=20):
    """
    Creates the Threefry bijection: a CBRNG built from a big number
    of fast rounds (bit rotations).

    :param bitness: ``32`` or ``64``, corresponds to the size of generated random integers.
    :param counter_words: ``2`` or ``4``, number of integers generated in one go.
    :param rounds: ``1`` to ``72``, the more rounds, the better randomness is achieved.
        Default values are big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Rotation tables, selected by the (bitness, counter_words) pair.
    rotations_by_variant = {
        # These are the R_256 constants from the Threefish reference sources
        # with names changed to R_64x4...
        (64, 4): numpy.array([
            [14, 52, 23, 5, 25, 46, 58, 32],
            [16, 57, 40, 37, 33, 12, 22, 32],
        ]).T,

        # Output from skein_rot_search: (srs64_B64-X1000)
        # Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
        # Start: Tue Mar  1 10:07:48 2011
        # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
        (64, 2): numpy.array([
            [16, 42, 12, 31, 16, 32, 24, 21],
        ]).T,
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  8  [  8  8  8  8 ]
        # 6 rounds: minHW = 16  [ 16 16 16 16 ]
        # 7 rounds: minHW = 32  [ 32 32 32 32 ]
        # 8 rounds: minHW = 64  [ 64 64 64 64 ]
        # 9 rounds: minHW = 64  [ 64 64 64 64 ]
        # 10 rounds: minHW = 64  [ 64 64 64 64 ]
        # 11 rounds: minHW = 64  [ 64 64 64 64 ]

        # Output from skein_rot_search: (srs-B128-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Mon Aug 24 22:41:36 2009
        # ...
        # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
        (32, 4): numpy.array([
            [10, 11, 13, 23, 6, 17, 25, 18],
            [26, 21, 27, 5, 20, 11, 10, 20],
        ]).T,
        # 4 rounds: minHW =  3  [  3  3  3  3 ]
        # 5 rounds: minHW =  7  [  7  7  7  7 ]
        # 6 rounds: minHW = 12  [ 13 12 13 12 ]
        # 7 rounds: minHW = 22  [ 22 23 22 23 ]
        # 8 rounds: minHW = 31  [ 31 31 31 31 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]

        # Output from skein_rot_search (srs32x2-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
        # Start: Tue Jul 12 11:11:33 2011
        # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format
        (32, 2): numpy.array([
            [13, 15, 26, 6, 17, 29, 16, 24],
        ]).T,
        # 4 rounds: minHW =  4  [  4  4  4  4 ]
        # 5 rounds: minHW =  6  [  6  8  6  8 ]
        # 6 rounds: minHW =  9  [  9 12  9 12 ]
        # 7 rounds: minHW = 16  [ 16 24 16 24 ]
        # 8 rounds: minHW = 32  [ 32 32 32 32 ]
        # 9 rounds: minHW = 32  [ 32 32 32 32 ]
        # 10 rounds: minHW = 32  [ 32 32 32 32 ]
        # 11 rounds: minHW = 32  [ 32 32 32 32 ]
    }

    # Taken from Skein
    parity_by_bitness = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    # Any bitness other than 32 selects 64-bit words.
    if bitness == 32:
        word_dtype = dtypes.normalize_type(numpy.uint32)
    else:
        word_dtype = dtypes.normalize_type(numpy.uint64)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("threefry")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        rotation_constants=rotations_by_variant[(bitness, counter_words)],
        parity_constant=parity_by_bitness[bitness])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
Esempio n. 30
0
 def __init__(self, module, dtype, components=1):
     """
     Stores ``module`` and ``components`` as given,
     and ``dtype`` in its normalized form.
     """
     self.module, self.components = module, components
     self.dtype = dtypes.normalize_type(dtype)
Esempio n. 31
0
 def __init__(self, module, dtype, components=1, noise_sources=1, real_noise=False):
     """
     Stores ``module``, ``components``, ``noise_sources`` and ``real_noise``
     as given, and ``dtype`` in its normalized form.
     """
     self.module, self.components = module, components
     self.noise_sources, self.real_noise = noise_sources, real_noise
     self.dtype = dtypes.normalize_type(dtype)
Esempio n. 32
0
def threefry(bitness, counter_words, rounds=20):
    """
    Creates a Threefry CBRNG, which relies on a large number
    of cheap rounds (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, how many integers are generated in one go.
    :param rounds: ``1`` to ``72``; more rounds give better randomness.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """

    # Rotation tables, one per supported (bitness, counter_words) pair.
    # Each one is stored transposed, i.e. with shape (8, counter_words // 2).

    # The R_256 constants from the Threefish reference sources
    # (with names changed to R_64x4...).
    rot_64x4 = numpy.array([
        [14, 52, 23, 5, 25, 46, 58, 32],
        [16, 57, 40, 37, 33, 12, 22, 32]]).T

    # Output from skein_rot_search: (srs64_B64-X1000)
    # Random seed = 1. BlockSize = 128 bits. sampleCnt =  1024. rounds =  8, minHW_or=57
    # Start: Tue Mar  1 10:07:48 2011
    # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
    # minHW for 4..11 rounds: 4, 8, 16, 32, 64, 64, 64, 64
    rot_64x2 = numpy.array([
        [16, 42, 12, 31, 16, 32, 24, 21]]).T

    # Output from skein_rot_search: (srs-B128-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
    # Start: Mon Aug 24 22:41:36 2009
    # ...
    # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
    # minHW for 4..11 rounds: 3, 7, 12, 22, 31, 32, 32, 32
    rot_32x4 = numpy.array([
        [10, 11, 13, 23, 6, 17, 25, 18],
        [26, 21, 27, 5, 20, 11, 10, 20]]).T

    # Output from skein_rot_search (srs32x2-X5000.out)
    # Random seed = 1. BlockSize = 64 bits. sampleCnt =  1024. rounds =  8, minHW_or=28
    # Start: Tue Jul 12 11:11:33 2011
    # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize=  64].format
    # minHW for 4..11 rounds: 4, 6, 9, 16, 32, 32, 32, 32
    rot_32x2 = numpy.array([
        [13, 15, 26, 6, 17, 29, 16, 24]]).T

    rotation_tables = {
        (64, 4): rot_64x4,
        (64, 2): rot_64x2,
        (32, 4): rot_32x4,
        (32, 2): rot_32x2,
    }

    # Key schedule parity constants, taken from Skein.
    parity_by_bitness = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    # Anything other than 32 selects the 64-bit word type here;
    # an unsupported bitness only fails later, at the table lookup.
    if bitness == 32:
        word_type = numpy.uint32
    else:
        word_type = numpy.uint64
    word_dtype = dtypes.normalize_type(word_type)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    # Evaluated in the same order as the arguments of the module constructor would be.
    template_def = TEMPLATE.get_def("threefry")
    word_ctype = dtypes.ctype(word_dtype)
    rotation_constants = rotation_tables[(bitness, counter_words)]
    parity_constant = parity_by_bitness[bitness]

    render_kwds = {
        "word_dtype": word_dtype,
        "word_ctype": word_ctype,
        "key_words": key_words,
        "counter_words": counter_words,
        "key_ctype": key_ctype,
        "counter_ctype": counter_ctype,
        "rounds": rounds,
        "rotation_constants": rotation_constants,
        "parity_constant": parity_constant,
    }
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
Esempio n. 33
0
    to_device = (to_device1, to_device2)
    from_device = (from_device1, from_device2, from_device3, from_device4,
                   from_device5)

    for to_d, from_d in itertools.product(to_device, from_device):
        a_device = to_d(a)
        a_copy = thr.copy_array(a_device)
        a_back = from_d(a_copy)
        assert diff_is_negligible(a, a_back)


@pytest.mark.parametrize(
    "dtype",
    TEST_DTYPES,
    ids=[dtypes.normalize_type(dtype).name for dtype in TEST_DTYPES])
def test_dtype_support(thr, dtype):
    # Test passes if either thread correctly reports that it does not support given dtype,
    # or it successfully compiles kernel that operates with this dtype.

    N = 256

    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    mul = functions.mul(dtype, dtype)
    div = functions.div(dtype, dtype)
    program = thr.compile("""
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
    {
Esempio n. 34
0
        thr.synchronize()
        return y

    to_device = (to_device1, to_device2)
    from_device = (from_device1, from_device2, from_device3, from_device4, from_device5)

    for to_d, from_d in itertools.product(to_device, from_device):
        a_device = to_d(a)
        a_copy = thr.copy_array(a_device)
        a_back = from_d(a_copy)
        assert diff_is_negligible(a, a_back)


@pytest.mark.parametrize(
    "dtype", TEST_DTYPES,
    ids=[dtypes.normalize_type(dtype).name for dtype in TEST_DTYPES])
def test_dtype_support(thr, dtype):
    # Test passes if either thread correctly reports that it does not support given dtype,
    # or it successfully compiles kernel that operates with this dtype.

    N = 256

    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    mul = functions.mul(dtype, dtype)
    div = functions.div(dtype, dtype)
    program = thr.compile(
    """
    KERNEL void test(
        GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)