def buffered_bounded_lemire_uint16(bitgen, rng, bcnt, buf):
    """
    Draw a bounded unsigned 16 bit integer with Lemire's rejection method.

    A 16 bit draw is scaled by ``rng + 1``; the upper 16 bits of the 32 bit
    product are the candidate result and the lower 16 bits decide whether
    the candidate is kept or redrawn.

    ``bcnt`` and ``buf`` are the buffer state handed to ``buffered_uint16``,
    which lets one 32 bit draw of the BitGenerator serve several smaller
    integers. The updated state is returned with the result as
    ``(value, bcnt, buf)``.
    """
    # Note: `rng` must not be 0xFFFF, otherwise `span` wraps around to zero.
    span = uint16(rng) + uint16(1)

    assert (rng != 0xFFFF)

    # First candidate: a scaled random number.
    draw, bcnt, buf = buffered_uint16(bitgen, bcnt, buf)
    scaled = uint32(draw * span)
    low_bits = scaled & 0xFFFF

    # Rejection sampling removes the bias of the scaling step.
    if low_bits < span:
        # `span` is a cheap upper bound for `cutoff`, so the modulo is only
        # evaluated when a rejection is possible at all.
        cutoff = ((uint16(UINT16_MAX) - rng) % span)
        while low_bits < cutoff:
            draw, bcnt, buf = buffered_uint16(bitgen, bcnt, buf)
            scaled = uint32(draw * span)
            low_bits = scaled & 0xFFFF

    return scaled >> 16, bcnt, buf
def do_round(a, b, c, d, e, f, g, h, key, word, record):
    """
    Run one round over the eight working registers ``a`` .. ``h``.

    ``key`` is folded into the temporary value, ``word`` is added to the
    new ``a`` and ``e`` registers, and every other register takes the
    previous value of its predecessor. All sums are reduced with
    ``mask32bit``.

    When ``record`` is truthy the last element of the result is the number
    of bits that differ between the old and the new register values,
    summed over all eight registers; otherwise it is 0.

    Returns the tuple ``(a, b, c, d, e, f, g, h, hamming_distance)``.
    """
    previous = (a, b, c, d, e, f, g, h)

    tmp = uint32((Sigma1(e) + Ch(e, f, g) + h + key) & mask32bit)
    preA = uint32((Sigma0(a) + Maj(a, b, c) + tmp) & mask32bit)
    preE = uint32((tmp + d) & mask32bit)

    # Shift the registers down by one place; `a` and `e` get fresh values.
    a, b, c, d, e, f, g, h = (
        (preA + word) & mask32bit, a, b, c,
        (preE + word) & mask32bit, e, f, g,
    )

    hamming_distance = 0
    if record:
        for new, old in zip((a, b, c, d, e, f, g, h), previous):
            hamming_distance += bin(new ^ old).count('1')

    return a, b, c, d, e, f, g, h, hamming_distance
def packbits(self):
    """
    Pack ``self.bitset`` into bytes, eight entries per byte.

    The first entry of each group of eight becomes the most significant
    bit of its byte (the layout ``np.packbits`` produces by default). When
    ``self.dimension`` is not a multiple of eight, the last byte is padded
    with zero bits rather than indexing past the end of the data.

    Returns
    -------
    np.ndarray
        ``uint8`` array holding ``ceil(self.dimension / 8)`` bytes.
    """
    # Round up so a trailing partial group still gets a byte of its own.
    nbytes = (self.dimension + 7) // 8
    packedarray = np.zeros(nbytes, dtype=np.uint8)
    for offset in range(0, self.dimension, 8):
        packed = 0
        chunk = self.bitset[offset:offset + 8]
        # A short final chunk simply contributes fewer bits.
        for position in range(len(chunk)):
            if chunk[position]:
                packed |= 1 << (7 - position)
        packedarray[offset // 8] = packed
    return packedarray
def fnv1a(seq): """32-bit FNV-1a hash for 32-bit sequences :param seq: signed 32-bit sequence :returns: unsigned 32-bit checksum """ fnv_32_prime = uint32(0x01000193) h = uint32(0x811c9dc5) for s in seq: u = uint32(s) h = (h ^ (u & 0xff)) * fnv_32_prime h = (h ^ ((u >> 8) & 0xff)) * fnv_32_prime h = (h ^ ((u >> 16) & 0xff)) * fnv_32_prime h = (h ^ ((u >> 24) & 0xff)) * fnv_32_prime return h
def setup_const(nh, nto, nn, dt):
    """
    Cast the inputs to fixed-width numba types and derive constants.

    Returns ``(nh, nn, dt, pi, sqrt_dt, o_nh, o_6)`` where ``nh`` and ``nn``
    are ``uint32``, and the remaining values are ``float32``: ``dt``,
    ``pi``, ``sqrt(dt)``, ``1 / nh * nto`` and ``1 / 6``.
    """
    nh = nb.uint32(nh)
    nn = nb.uint32(nn)
    dt = nb.float32(dt)
    pi = nb.float32(np.pi)

    # Derived constants are computed from the already-cast values.
    sqrt_dt = nb.float32(np.sqrt(dt))
    o_nh = nb.float32(1 / nh * nto)
    o_6 = nb.float32(1 / 6)

    return nh, nn, dt, pi, sqrt_dt, o_nh, o_6
def xoroshiro128p_next(states, index):
    '''Return the next random uint64 and advance the RNG in states[index].

    The returned value is the sum of the two state words as they were
    before the update.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    '''
    index = int64(index)
    s0 = states[index]['s0']
    s1 = states[index]['s1']
    result = s0 + s1

    # Both new state words are derived from the XOR of the old ones.
    mixed = s1 ^ s0
    states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ mixed ^ (mixed << uint32(14))
    states[index]['s1'] = uint64(rotl(mixed, uint32(36)))

    return result
def generate_random_vector(self):
    """
    Create a shuffled 0/1 vector with equally many ones and zeros.

    The vector has ``2 * uint32(self.dimension / 2)`` entries. It is stored
    as ``self.bitset`` (``uint8``), and a ``float32`` copy is stored as
    ``self.votingRecord``. Both are returned as
    ``(self.bitset, self.votingRecord)``.
    """
    halfdimension = uint32(self.dimension / 2)
    ones_half = np.ones(halfdimension, dtype=uint8)
    zeros_half = np.zeros(halfdimension, dtype=uint8)

    randvec = np.concatenate((ones_half, zeros_half))
    np.random.shuffle(randvec)

    self.bitset = randvec
    self.votingRecord = randvec.astype(float32)
    # Original author's note: returning the locals directly is only 1-2us
    # faster than going through the attributes when JIT'd, and either
    # option JIT'd is ~30x faster than straight python.
    return self.bitset, self.votingRecord
def init_xoroshiro128p_state(states, index, seed):
    '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.

    This ensures that manually set small seeds don't result in a
    predictable initial sequence from the random number generator. Both
    state words, ``s0`` and ``s1``, are set to the same mixed value.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: uint64
    :param index: offset in states to update
    :type seed: int64
    :param seed: seed value to use when initializing state
    '''
    index = int64(index)

    # Add the increment, then apply two xor-shift-multiply steps and a
    # final xor-shift.
    mixed = uint64(seed) + uint64(0x9E3779B97F4A7C15)
    mixed = (mixed ^ (mixed >> uint32(30))) * uint64(0xBF58476D1CE4E5B9)
    mixed = (mixed ^ (mixed >> uint32(27))) * uint64(0x94D049BB133111EB)
    mixed = mixed ^ (mixed >> uint32(31))

    states[index]['s0'] = mixed
    states[index]['s1'] = mixed
def parse_int_strtok(s):
    """
    Convert an IP address given as a string to an int, similar to
    socket.inet_aton(). Performs no error checking!

    The string is split on "." with ``strtok``; the four tokens fill the
    result from the most significant byte down to the least significant.
    """
    result = nb.uint32(0)
    token = strtok(s, ".")
    # Bit offsets 24, 16, 8, 0: the first token is the top byte.
    for shift in range(24, -8, -8):
        result |= atoi(token) << shift
        token = strtok(int_p(nb.NULL), ".")
    return result
def buffered_bounded_lemire_uint32(bitgen, rng):
    """
    Draw a bounded unsigned 32 bit integer with Lemire's rejection method.

    A 32 bit draw is scaled by ``rng + 1``; the upper 32 bits of the 64 bit
    product are the candidate result and the lower 32 bits decide whether
    the candidate is kept or redrawn.
    """
    span = uint32(rng) + uint32(1)

    assert (rng != 0xFFFFFFFF)

    # First candidate: a scaled random number.
    scaled = uint64(next_uint32(bitgen)) * uint64(span)
    low_bits = scaled & 0xFFFFFFFF

    # Rejection sampling removes the bias of the scaling step.
    if low_bits < span:
        # `span` is a cheap upper bound for `cutoff`, so the modulo is only
        # evaluated when a rejection is possible at all.
        cutoff = (UINT32_MAX - rng) % span
        while low_bits < cutoff:
            scaled = uint64(next_uint32(bitgen)) * uint64(span)
            low_bits = scaled & 0xFFFFFFFF

    return (scaled >> 32)
def drawPolys(screenSize, surface, points, faces, zbuffer, depth):
    """
    Draw every face whose first vertex lies within ``(0, depth]`` on axis 2.

    Each row of ``faces`` is a list of indices into ``points``; a negative
    index ends the face. A face is drawn as triangles that all share its
    first vertex, using one random colour per face. A triangle is passed
    to ``drawTriangle`` only when its other two vertices are also within
    ``(0, depth]``.
    """
    for face in range(faces.shape[0]):
        anchor = faces[face][0]
        if not (0 < points[anchor][2] <= depth):
            continue

        color = uint32(random() * 1000000)
        triangle = np.empty((3, 3), dtype=np.int32)
        # Vertex 0 of every triangle is the face's first point.
        for axis in range(3):
            triangle[0][axis] = points[anchor][axis]

        for point in range(2, faces.shape[1]):
            if faces[face][point] < 0:
                break
            for axis in range(3):
                triangle[1][axis] = points[faces[face][point - 1]][axis]
                triangle[2][axis] = points[faces[face][point]][axis]
            if 0 < triangle[1][2] <= depth and 0 < triangle[2][2] <= depth:
                drawTriangle(screenSize, surface, triangle, color, zbuffer)
def test_4(self):
    """Run ``func3`` with four signatures against float64, float32, int32
    and uint32 inputs, comparing each run via ``_run_and_compare``."""
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    func = self.funcs['func3']

    A = np.arange(100, dtype=np.float64)
    self._run_and_compare(func, sig, A, A)

    # Each array is derived from the previous one, as a chain of casts.
    for dtype in (np.float32, np.int32, np.uint32):
        A = A.astype(dtype)
        self._run_and_compare(func, sig, A, A)
def parse_int_manual(s):
    """
    Convert an IP address given as a string to an int, similar to
    socket.inet_aton(). Performs no error checking!

    The string is scanned once; at every "." and at the last character
    the field starting at ``start`` is converted with ``atoi`` and placed
    into the next lower byte of the result, starting from the top byte.
    """
    result = nb.uint32(0)
    last = len(s) - 1
    start = 0
    shift = 24
    for i in range(last + 1):
        if s[i] != '.'[0] and i != last:
            continue
        byte = atoi(int8_p(s) + start)
        result |= byte << shift
        shift -= 8
        start = i + 1
    return result
def _test_template_4(self, target):
    """Vectorize ``vector_add`` for ``target`` with four signatures and
    check it against ``np.add`` for double, float32, int32 and uint32."""
    signatures = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(signatures, target=target)(vector_add)
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        actual = basic_ufunc(data, data)
        expected = np_ufunc(data, data)
        self.assertTrue(np.allclose(expected, actual))

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def _test_template_4(self, target):
    """Vectorize ``vector_add`` for ``target`` with four signatures and
    check it against ``np.add`` for double, float32, int32 and uint32,
    using ``np.testing.assert_allclose``."""
    signatures = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(signatures, target=target)(vector_add)
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        actual = basic_ufunc(data, data)
        expected = np_ufunc(data, data)
        np.testing.assert_allclose(expected, actual)

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    For every bit of the two ``XOROSHIRO128P_JUMP`` words, the current
    state is XOR-ed into an accumulator when the bit is set, and the
    generator is stepped once regardless. The accumulator becomes the new
    state.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    acc0 = uint64(0)
    acc1 = uint64(0)
    for word in range(2):
        for bit in range(64):
            if XOROSHIRO128P_JUMP[word] & (uint64(1) << uint32(bit)):
                acc0 ^= states[index]['s0']
                acc1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = acc0
    states[index]['s1'] = acc1
def template_vectorize(self, target):
    """Vectorize ``vector_add`` for ``target`` and compare it with
    ``np.add`` for double, float32, int32 and uint32 inputs."""
    # Native-code ufunc built from the four supported signatures.
    signatures = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(signatures, target=target)(vector_add)

    # Reference ufunc.
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        actual = basic_ufunc(data, data)
        expected = np_ufunc(data, data)
        self.assertTrue(np.allclose(expected, actual))

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    For every bit of the two jump constants, the current state is XOR-ed
    into an accumulator when the bit is set, and the generator is stepped
    once regardless. The accumulator becomes the new state.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))

    acc0 = uint64(0)
    acc1 = uint64(0)
    for word in range(2):
        for bit in range(64):
            bit_mask = uint64(1) << uint32(bit)
            if jump[word] & bit_mask:
                acc0 ^= states[index]['s0']
                acc1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = acc0
    states[index]['s1'] = acc1
def xoroshiro128p_jump(states, index):
    """Advance the RNG in ``states[index]`` by 2**64 steps.

    Each of the 128 bits of the jump constant triggers one generator step;
    the states seen at set bits are XOR-ed together and written back as
    the new state once all steps are done.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    """
    index = int64(index)

    jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922))

    folded_s0 = uint64(0)
    folded_s1 = uint64(0)
    for part in range(2):
        for position in range(64):
            selected = jump[part] & (uint64(1) << uint32(position))
            if selected:
                folded_s0 ^= states[index]["s0"]
                folded_s1 ^= states[index]["s1"]
            xoroshiro128p_next(states, index)

    states[index]["s0"] = folded_s0
    states[index]["s1"] = folded_s1
def template_vectorize(self, target):
    """Check the vectorized ``vector_add`` built for ``target`` against
    ``np.add`` on double, float32, int32 and uint32 data."""
    # Signatures the native-code ufunc is compiled for.
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(sig, target=target)(vector_add)

    # Reference implementation.
    np_ufunc = np.add

    def test(ty):
        samples = np.linspace(0., 100., 500).astype(ty)
        computed = basic_ufunc(samples, samples)
        reference = np_ufunc(samples, samples)
        self.assertTrue(np.allclose(reference, computed))

    for dtype in (np.double, np.float32, np.int32, np.uint32):
        test(dtype)
def uint_int_div_ary(elts, normdist, seed):
    """
    Fill ``elts[i]`` with ``seed[i] // uint32(len(normdist))`` for every
    index of ``elts``.

    ``elts`` is modified in place; nothing is returned. ``range`` is used
    instead of ``xrange`` so the function also runs uncompiled on
    Python 3, where ``xrange`` does not exist.
    """
    # Problem with using sext instead of zext for uint32
    divisor = uint32(normdist.shape[0])
    for i in range(elts.shape[0]):
        elts[i] = seed[i] // divisor
import numpy as _np
import abc
import numba

# Number of set bits for every possible byte value (index 0..255).
_HW_LUT = _np.array([bin(value).count('1') for value in range(256)], dtype='uint32')


@numba.vectorize([numba.uint32(numba.uint8)])
def _fhw8(x):
    """Number of set bits of an 8 bit value, read from the lookup table."""
    return _HW_LUT[x]


@numba.vectorize([numba.uint32(numba.uint16)])
def _fhw16(x):
    """Number of set bits of a 16 bit value: low byte plus high byte."""
    low = _HW_LUT[x & 0x00ff]
    high = _HW_LUT[x >> 8]
    return low + high


@numba.vectorize([numba.uint32(numba.uint32)])
def _fhw32(x):
    """Number of set bits of a 32 bit value, summed over its four bytes."""
    total = 0
    remaining = x
    for _ in range(4):
        total += _HW_LUT[remaining & 0x000000ff]
        remaining >>= 8
    return total
def uint64_to_unit_float64(x):
    '''Convert uint64 to float64 value in the range [0.0, 1.0)

    The low 11 bits are dropped and the remaining 53 bit integer is
    multiplied by ``1 / 2**53``.
    '''
    mantissa = uint64(x) >> uint32(11)
    scale = float64(1) / (uint64(1) << uint32(53))
    return mantissa * scale
def rk4_rV_wrapper(nrV, rti, Vti, o_tau, pi, tau, Delta, eta, J, I, cr, rc,
                   cv, Vc, r_sigma, V_sigma, z0, z1):
    """Call ``rk4_rV`` with a ``uint32`` zero as its first argument and all
    received arguments forwarded unchanged. Nothing is returned."""
    first_arg = nb.uint32(0)
    rk4_rV(
        first_arg, nrV, rti, Vti, o_tau, pi, tau, Delta, eta, J, I,
        cr, rc, cv, Vc, r_sigma, V_sigma, z0, z1,
    )
def rotl(x, k):
    '''Left rotate x by k bits.

    ``x`` is treated as a 64 bit unsigned value: the bits shifted out at
    the top are brought back in at the bottom.
    '''
    value = uint64(x)
    shift = uint32(k)
    upper = value << shift
    wrapped = value >> uint32(64 - shift)
    return upper | wrapped
import numba as nb
import numpy as np
from numba import prange

# Sample 5x5 boolean grid; its numba type is used for the explicit
# signature of `largest_cross` below.
a = np.array([[0, 1, 1, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0]], dtype='b1')
Numba2dBooleanArray = nb.typeof(a)
# Show the numpy dtype and the numba type inferred for the sample array.
print(a.dtype)
print(Numba2dBooleanArray)


@nb.njit(nb.uint32(Numba2dBooleanArray), fastmath=True)
def largest_cross(a):
    r, l = a.shape
    # optimize for memory usage by setting the type
    left = np.zeros(a.shape, dtype=nb.uint32)
    right = np.zeros(a.shape, dtype=nb.uint32)
    top = np.zeros(a.shape, dtype=nb.uint32)
    bottom = np.zeros(a.shape, dtype=nb.uint32)
    # left[i][j] counts the consecutive true cells ending just left of (i, j).
    for i in range(r):
        for j in range(1, l):
            if a[i][j - 1]:
                left[i][j] = left[i][j - 1] + 1
    for i in range(r):
        # NOTE(review): j starts at l - 1, so a[i][j + 1] reads column l,
        # one past the last column of `a` -- confirm whether the loop
        # should start at l - 2.
        for j in range(l - 1, -1, -1):
            if a[i][j + 1]:
def impl(time, interval, antenna1, antenna2, flag_row=None, time_bin_secs=1):
    """
    Build the mapping from input rows to time-averaged output rows.

    Rows are grouped by unique baseline (``unique_baselines``) and unique
    time (``unique_time``). Within a baseline, samples are visited in
    unique-time order and added to the current bin until the next sample
    would make the bin span more than ``time_bin_secs``. The span is
    measured from the lower edge (``time - interval / 2``) of the bin's
    first sample to the upper edge (``time + interval / 2``) of the
    candidate sample. At that point the bin is closed and the candidate
    opens a new one.

    Parameters
    ----------
    time, interval
        Per-row arrays, indexed by input row.
    antenna1, antenna2
        Per-row antenna arrays, passed to ``unique_baselines``.
    flag_row
        Optional row flags; only handed to ``is_flagged_fn``,
        ``output_flag_row`` and ``set_flag_row``.
    time_bin_secs
        Maximum span of one output bin, as described above.

    Returns
    -------
    RowMapOutput
        ``(row_map, time_ret, int_ret, out_flag_row)``: the output row of
        each input row, the mean time of each output row in ascending
        order, the summed interval of each output row, and the value
        produced by ``output_flag_row`` after ``set_flag_row`` was applied
        for every input row.

    Raises
    ------
    RowMapperError
        If an input row maps to an output row index that is not smaller
        than the number of output rows.
    """
    ubl, _, bl_inv, _ = unique_baselines(antenna1, antenna2)
    utime, _, time_inv, _ = unique_time(time)

    nbl = ubl.shape[0]
    ntime = utime.shape[0]

    # Largest representable time: unused bins sort behind all real bins.
    sentinel = np.finfo(time.dtype).max
    out_rows = numba.uint32(0)

    # One allocation backs the three integer lookups.
    scratch = np.full(3 * nbl * ntime, -1, dtype=np.int32)
    row_lookup = scratch[:nbl * ntime].reshape(nbl, ntime)
    bin_lookup = scratch[nbl * ntime:2 * nbl * ntime].reshape(nbl, ntime)
    inv_argsort = scratch[2 * nbl * ntime:]
    time_lookup = np.zeros((nbl, ntime), dtype=time.dtype)
    interval_lookup = np.zeros((nbl, ntime), dtype=interval.dtype)
    bin_flagged = np.zeros((nbl, ntime), dtype=np.bool_)

    # Create a mapping from the full bl x time resolution back
    # to the original input rows
    for r in range(time.shape[0]):
        bl = bl_inv[r]
        t = time_inv[r]
        row_lookup[bl, t] = r

    # Average times over each baseline and construct the
    # bin_lookup and time_lookup arrays
    for bl in range(ubl.shape[0]):
        tbin = numba.int32(0)
        bin_count = numba.int32(0)
        bin_flag_count = numba.int32(0)
        bin_low = time.dtype.type(0)

        for t in range(utime.shape[0]):
            # Lookup input row
            r = row_lookup[bl, t]

            # Ignore if not present
            if r == -1:
                continue

            # At this point, we decide whether to contribute to
            # the current bin, or create a new one. We don't add
            # the current sample to the current bin if
            # high - low > time_bin_secs
            half_int = interval[r] * 0.5

            # If the sample would exceed the seconds in the bin, normalise
            # the bin, flag it if total counts match flagged counts, and
            # move on to the next bin.
            if bin_count > 0 and time[r] + half_int - bin_low > time_bin_secs:
                time_lookup[bl, tbin] /= bin_count
                bin_flagged[bl, tbin] = bin_count == bin_flag_count

                tbin += 1
                bin_count = 0
                bin_flag_count = 0

            # The sample opens a bin (the first one, or the one started
            # just above): its lower edge becomes the bin's lower edge.
            # Without this a freshly started bin would keep the previous
            # bin's lower edge and overflow on the very next sample.
            if bin_count == 0:
                bin_low = time[r] - half_int

            # Record the output bin associated with the row
            bin_lookup[bl, t] = tbin

            # Time + Interval take unflagged + unflagged
            # samples into account (nominal value)
            time_lookup[bl, tbin] += time[r]
            interval_lookup[bl, tbin] += interval[r]
            bin_count += 1

            # Record flags
            if is_flagged_fn(flag_row, r):
                bin_flag_count += 1

        # Normalise the last bin if it has entries in it
        if bin_count > 0:
            time_lookup[bl, tbin] /= bin_count
            bin_flagged[bl, tbin] = bin_count == bin_flag_count
            tbin += 1

        # Add this baseline's number of bins to the output rows
        out_rows += tbin

        # Set any remaining bins to sentinel value and unflagged
        for b in range(tbin, ntime):
            time_lookup[bl, b] = sentinel
            bin_flagged[bl, b] = False

    # Flatten the time lookup and argsort it
    flat_time = time_lookup.ravel()
    flat_int = interval_lookup.ravel()
    argsort = np.argsort(flat_time, kind='mergesort')

    # Generate lookup from flattened (bl, time) to output row
    for i, a in enumerate(argsort):
        inv_argsort[a] = i

    # Construct the final row map
    row_map = np.empty((time.shape[0]), dtype=np.uint32)

    # Construct output flag row, if necessary
    out_flag_row = output_flag_row(out_rows, flag_row)

    # foreach input row
    for in_row in range(time.shape[0]):
        # Lookup baseline and time
        bl = bl_inv[in_row]
        t = time_inv[in_row]

        # lookup time bin and output row
        tbin = bin_lookup[bl, t]
        # lookup output row in inv_argsort
        out_row = inv_argsort[bl * ntime + tbin]

        if out_row >= out_rows:
            raise RowMapperError("out_row >= out_rows")

        # Handle output row flagging
        set_flag_row(flag_row, in_row, out_flag_row, out_row,
                     bin_flagged[bl, tbin])

        row_map[in_row] = out_row

    # Sentinel-valued bins sort last, so the first out_rows entries are
    # exactly the populated bins.
    time_ret = flat_time[argsort[:out_rows]]
    int_ret = flat_int[argsort[:out_rows]]

    return RowMapOutput(row_map, time_ret, int_ret, out_flag_row)
import numpy as np
import unittest
from numba import void, int32, uint32, jit, int64


@jit(void(uint32[:], uint32, uint32))
def prng(X, A, C):
    """Update every element of ``X`` in place by applying
    ``x -> (A * x + C) & 0xffffffff`` one hundred times."""
    for i in range(X.shape[0]):
        for j in range(100):
            v = (A * X[i] + C)
            # Keep only the low 32 bits of the product-plus-offset.
            X[i] = v & 0xffffffff


@jit(uint32())
def unsigned_literal():
    """Return ``abs`` of the literal 0xFFFFFFFF with a ``uint32`` return type."""
    return abs(0xFFFFFFFF)


@jit(int64())
def unsigned_literal_64():
    """Return the literal 0x100000000 (2**32) with an ``int64`` return type."""
    return 0x100000000


@jit(int64(int32))
def constant_int_add(a):
    """Return ``0xffffffff + a`` for an ``int32`` argument as ``int64``."""
    return 0xffffffff + a


class Test(unittest.TestCase):
    def test_prng(self):
        N = 100
        A = 1664525
        C = 1013904223
        X0 = np.arange(N, dtype=np.uint32)
        X1 = X0.copy()
from quspin.basis import spinless_fermion_basis_1d # Hilbert space spin basis_1d from quspin.basis.user import user_basis # Hilbert space user basis from quspin.basis.user import next_state_sig_32, op_sig_32, map_sig_32, count_particles_sig_32 # user basis data types signatures from numba import carray, cfunc, jit # numba helper functions from numba import uint32, int32 # numba data types import numpy as np from scipy.special import comb # N = 8 # lattice sites Np = N // 2 # total number of fermions # ############ create soinless fermion user basis object ############# # @jit(uint32(uint32, uint32), locals=dict(f_count=uint32, ), nopython=True, nogil=True) def _count_particles_32(state, site_ind): # auxiliary function to count number of fermions, i.e. 1's in bit configuration of the state, up to site site_ind # CAUTION: 32-bit integers code only! f_count = state & ((0x7FFFFFFF) >> (31 - site_ind)) f_count = f_count - ((f_count >> 1) & 0x55555555) f_count = (f_count & 0x33333333) + ((f_count >> 2) & 0x33333333) return (((f_count + (f_count >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24 # @cfunc( op_sig_32,
import numpy as np
import cmath
from pylab import imshow, show
from timeit import default_timer as timer
from numba import cuda
from numba import uint32, f8, uint16


@cuda.jit(uint32(f8, f8, uint32), device=True)
def mandel(x, y, max_iters):
    """Iterate ``z -> sin(z) + c`` from ``z = 0`` with ``c = x + y*1j``.

    Returns the index of the first iteration after which ``|z|**2`` is at
    least ``10 * pi``, or ``max_iters`` if that never happens.
    """
    c = complex(x, y)
    z = 0j
    for i in range(max_iters):
        z = cmath.sin(z) + c
        # Squared magnitude, compared without taking a square root.
        if (z.real * z.real + z.imag * z.imag) >= 10 * np.pi:
            return i
    return max_iters


@cuda.jit((f8, f8, f8, f8, uint16[:, :], uint32))
def mandel_kernel(min_x, max_x, min_y, max_y, image, max_iters):
    height = image.shape[0]
    width = image.shape[1]
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y
    for x in range(startX, width, gridX):
def Ch(ee, ff, gg):
    """Bitwise choose: take the bits of ``ff`` where ``ee`` is set and the
    bits of ``gg`` where ``ee`` is clear (within ``mask32bit``)."""
    not_ee = uint32(~ee & mask32bit)
    return (ee & ff) ^ (not_ee & gg)
def Ch(ee, ff, gg):
    """Bitwise choose: take the bits of ``ff`` where ``ee`` is set and the
    bits of ``gg`` where ``ee`` is clear (within ``mask32bit``)."""
    not_ee = uint32(~ee & mask32bit)
    return (ee & ff) ^ (not_ee & gg)


def Maj(aa, bb, cc):
    """Bitwise majority: each result bit is set when at least two of the
    corresponding bits of ``aa``, ``bb`` and ``cc`` are set."""
    ab = aa & bb
    ac = aa & cc
    bc = bb & cc
    return ab ^ ac ^ bc
import numpy as np
import math
from imageio import imread, imwrite
import sys
import cProfile
import time
from numba import njit, uint32, float32, int8, int32, uint8, int64, types, config

# from .plot_mv import plot_vector_field


@njit(uint32(uint8[:, :, :], uint8[:, :, :]), cache=True)
def get_sad(source_block, target_block):
    """Return the sum of absolute differences between two blocks.

    Both blocks are converted to float32 and reduced to a single plane
    with the weights 0.299 / 0.587 / 0.114 on channels 0 / 1 / 2 of the
    last axis (presumably RGB to luma -- confirm channel order with the
    caller). The declared return type is ``uint32``.
    """
    source_block = source_block.astype(np.float32)
    target_block = target_block.astype(np.float32)
    # Weighted channel sum: one value per pixel.
    source_block = 0.299 * source_block[:, :, 0] + 0.587 * source_block[:, :, 1] + 0.114 * source_block[:, :, 2]
    target_block = 0.299 * target_block[:, :, 0] + 0.587 * target_block[:, :, 1] + 0.114 * target_block[:, :, 2]
    return (np.sum(np.abs(np.subtract(source_block, target_block))))


@njit(float32[:, :, :](int32, int32, types.UniTuple(int32, 3), uint8[:, :, :], uint8[:, :, :]), cache=True)
def helper(block_size, steps, frame_shape, source_frame_pad, target_frame_pad):
    output = np.zeros(frame_shape, dtype=np.float32)
    prec_dic = [1, 2, 1, 2, 3, 2, 1, 2, 1]