def _convert_in2ind_inplace(id2ind, ind2id, id2ind_len, arr, allow_missing):
    """Convert IDs to indices in-place.

    id2ind is a NumPy array: a hash map mapping IDs to candidate indices;
    id2ind_len is its length. ind2id maps indices to their IDs. arr is the
    input/output array. When allow_missing is True, unrecognised IDs do not
    raise, but are instead replaced with SENTINEL.
    """
    for i, id_ in enumerate(arr):
        hash_ = nb.u4(id_ * nb.u4(0x9e3779b1)) % id2ind_len
        while True:
            index = id2ind[hash_]
            if index == SENTINEL:
                if not allow_missing:
                    raise KeyError('id not found')
                arr[i] = SENTINEL
                break
            # id2ind[hash_] might be the correct index, but this is not
            # guaranteed due to collisions. Look it up in ind2id to make
            # sure.
            if ind2id[index] == id_:
                arr[i] = index
                break
            hash_ += 1  # id2ind[hash_] was not correct. Try next cell.
            if hash_ >= id2ind_len:
                hash_ = 0  # Went past the end; wrap around.
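# Illustrative sketch only: the slot computation above is Knuth's multiplicative hash
# truncated to 32 bits, followed by linear probing. The ID and table length below are
# made-up example values; plain Python ints keep the 32-bit wrap-around explicit.
example_id, example_table_len = 1005, 8
slot = ((example_id * 0x9e3779b1) & 0xFFFFFFFF) % example_table_len
# On a collision the probe simply advances to (slot + 1) % example_table_len, wrapping
# at the end of the table, until the matching ID or an empty (SENTINEL) cell is found.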
def get_u32(buf, offset, length):
    """Read a little-endian uint32 from buf at offset.

    Returns (value, new_offset, remaining_length); if fewer than 4 bytes
    remain, the input offset and length are returned unchanged with value 0.
    """
    if length < 4:
        return (0, offset, length)
    a = nb.u4(buf[offset + 3]) << 24
    b = nb.u4(buf[offset + 2]) << 16
    c = nb.u4(buf[offset + 1]) << 8
    d = nb.u4(buf[offset + 0]) << 0
    return a | b | c | d, offset + 4, length - 4
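# A hypothetical usage sketch for get_u32, assuming it is @njit-compiled elsewhere in
# this module. struct.pack('<I', ...) produces the little-endian byte order the reader
# expects; .copy() avoids handing a read-only buffer to a jitted function.
import struct

import numpy as np

buf = np.frombuffer(struct.pack('<I', 0x12345678), dtype=np.uint8).copy()
value, new_offset, remaining = get_u32(buf, 0, len(buf))
# value should be 0x12345678, new_offset 4 and remaining 0.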
def convolve(signal, ref, window, result):
    smem = cuda.shared.array(0, f8)  # Dynamic shared memory: one f8 slot per warp
    i, j = cuda.grid(2)
    S = signal.size
    W = window.size
    R = ref.shape[0]
    Bix = cuda.blockIdx.x   # Block index along x -> indexing the reference row / result row
    BDx = cuda.blockDim.x   # Number of threads along x
    tix = cuda.threadIdx.x  # x thread id within block [0, blockDim.x) -> indexing the window
    tiy = cuda.threadIdx.y  # y thread id within block [0, blockDim.y) -> indexing shared memory rows
    tif = tix + tiy * BDx   # Flat thread index within the block -> indexing lanes and shared memory

    index = j + tix  # reference and signal index
    value = f8(0)
    if (tix < W) & (index < S):
        value = window[tix] * (ref[Bix, index] * signal[index])
    value = reduce_warp(value, u4(0xffffffff))

    # The reduced sum is present in the value of all threads with lane index == 0.
    # Store the warp reduction in shared memory.
    if tif % 32 == 0:  # For all threads with lane index == 0
        smem[tif // 32] = value  # Flat warp id
    cuda.syncthreads()

    # When the block size along x is at most a single warp (32), we are done.
    # In this case we can be very specific about the locations we need.
    if (BDx <= 32) and (tix == 0):
        result[Bix, j] = smem[tiy]

    # Otherwise, take the values from the shared-memory reduce.
    # NOTE: the maximum number of threads per block is 1024, which is 32 warps of 32
    # threads each. This means the warp reductions of 32 warps fit back into a single warp.
    # Disperse the reduction values from shared memory over the first threads along the
    # x direction. All others become 0.
    Nwx = (BDx - 1) // 32 + 1  # Number of warps along x
    if tix < Nwx:
        value = smem[tix + Nwx * tiy]
    else:
        value = f8(0)
    # Perhaps it's better to put the index definition outside the if-else block and
    # remove this barrier.
    cuda.syncthreads()

    # All threads in the first warp along x reduce the per-warp sums once more.
    if tix // 32 == 0:
        value = reduce_warp(value, u4(0xffffffff))
    cuda.syncthreads()
    if (tix == 0) and (j < S):
        result[Bix, j] = value
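# A hypothetical launch sketch for the kernel above, assuming `convolve` is compiled
# with @cuda.jit and `reduce_warp` is a device-side warp-shuffle reduction defined
# elsewhere. All shapes and block dimensions below are example values.
import numpy as np
from numba import cuda

S, R, W = 4096, 8, 64                      # signal length, reference rows, window length
signal = cuda.to_device(np.random.rand(S))
ref = cuda.to_device(np.random.rand(R, S))
window = cuda.to_device(np.hanning(W))
result = cuda.device_array((R, S))

bx = ((W + 31) // 32) * 32                 # x threads cover the window, rounded up to a full warp
by = 1024 // bx                            # stay within the 1024-threads-per-block limit
grid = (R, (S + by - 1) // by)             # one x-block per reference row, y-blocks tile the signal
smem_bytes = (bx * by // 32) * 8           # one f8 slot per warp in the dynamic shared array

convolve[grid, (bx, by), 0, smem_bytes](signal, ref, window, result)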
def _get_string(buf_in, offset, length, check_null):
    """Copy up to `length` bytes from buf_in, starting at offset, into a fresh buffer.

    When check_null is True, copying stops at the first NUL byte; the NUL is
    consumed but not returned. Returns (bytes, new offset, bytes consumed).
    """
    buf_out = np.zeros(256, dtype=np.uint8)  # Assumes the string fits in 256 bytes.
    i = nb.u4(0)
    null_term = False
    while i < length:
        if check_null and buf_in[offset + i] == 0:
            null_term = True
            break
        buf_out[i] = buf_in[offset + i]
        i = nb.u4(i + nb.u4(1))
    if null_term:
        return buf_out[:i], offset + i + 1, i + 1
    else:
        return buf_out[:i], offset + i, i
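# A hypothetical usage sketch for _get_string, assuming it is @njit-compiled elsewhere.
# The payload bytes are made up for the example.
import numpy as np

payload = np.frombuffer(b'hello\x00world', dtype=np.uint8).copy()
raw, str_offset, consumed = _get_string(payload, 0, len(payload), True)
text = bytes(raw).decode('ascii')
# text should be 'hello'; str_offset (6) and consumed (6) include the NUL terminator.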
def _make_hash_map(ids, id2ind, arr_size, offset):
    """Place IDs in the hash map.

    ids is a NumPy array of IDs, ordered by the index. id2ind is the target
    array; arr_size is its size. offset is a constant added to the indices.
    """
    for i, id_ in enumerate(ids):
        # Pretty bad hashing method (by Knuth), but our IDs are already
        # almost uniform, they just need a little bit of help :)
        hash_ = nb.u4(id_ * nb.u4(0x9e3779b1)) % arr_size
        while id2ind[hash_] != SENTINEL:  # Cell not free.
            hash_ += 1
            if hash_ >= arr_size:
                hash_ = 0  # Gone past the end; wrap around.
        id2ind[hash_] = i + offset
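# A hypothetical end-to-end sketch tying _make_hash_map and _convert_in2ind_inplace
# together, assuming both are @njit-compiled and SENTINEL is a uint32-compatible
# sentinel constant (e.g. 0xFFFFFFFF). All IDs and sizes are made up for the example.
import numpy as np

ind2id = np.array([1001, 1002, 1005], dtype=np.uint32)   # index -> ID
map_size = 8                                             # larger than len(ind2id) so probing terminates
id2ind = np.full(map_size, SENTINEL, dtype=np.uint32)    # empty cells hold the sentinel
_make_hash_map(ind2id, id2ind, map_size, 0)              # offset 0: store indices as-is

query = np.array([1005, 1001, 9999], dtype=np.uint32)
_convert_in2ind_inplace(id2ind, ind2id, map_size, query, True)
# query should now read [2, 0, SENTINEL]: 9999 is unknown and, with allow_missing=True,
# is mapped to the sentinel instead of raising KeyError.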
        if tree_ops[i]:  # ADDITION: 1
            # print('value[{}] = {} + {}'.format(target, value_array[target], value_array[source1]))
            # value_array[target] = value_array[target] + value_array[source1]
            value_array[target] = value_array[target] + value_array[tree_recipe[i, 1]]
        else:
            # print('value[{}] = {} * {}'.format(target, value_array[target], value_array[source1]))
            # value_array[target] = value_array[target] * value_array[source1]
            value_array[target] = value_array[target] * value_array[tree_recipe[i, 1]]

    # the value at the first position is the value of the polynomial
    return value_array[0]


@jit(u4(u4[:]), nopython=True, cache=True)
def num_ops_1D_horner(unique_exponents):
    """
    :param unique_exponents: np array of unique exponents sorted in increasing order, without 0
    :return: the number of operations of the one-dimensional Horner factorisation,
        counting only MUL & POW (no additions) and ignoring the coefficients

    NOTE: in 1D the Horner factorisation is both unique and optimal (minimal number of operations)
    """
    nr_unique_exponents = unique_exponents.shape[0]

    # the exponent 0 is not present!
    assert not np.any(unique_exponents == 0)

    if nr_unique_exponents == 0:
        return 0
@njit(nogil=True, fastmath=True, cache=True)
def fnv(data, hval_init, fnv_prime, fnv_size):
    """
    Core FNV hash algorithm used in FNV0 and FNV1.
    """
    hval = hval_init
    for i in range(len(data)):
        byte = data[i]
        hval = (hval * fnv_prime) % fnv_size
        hval = hval ^ byte
    return hval


@njit(u4(u1[:]), nogil=True, fastmath=True, cache=True)
def fnv0_32(data):
    """
    Returns the 32 bit FNV-0 hash value for the given data.
    """
    return fnv(data, FNV0_32_INIT, FNV_32_PRIME, 2**32)


# @njit(u8(u1[:]), nogil=True, fastmath=True, cache=True)
# def fnv0_64(data):
#     """
#     Returns the 64 bit FNV-0 hash value for the given data.
#     """
#     return fnv(data, FNV0_64_INIT, FNV_64_PRIME, 2**64)
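# A quick usage sketch for the 32-bit FNV-0 hash above. FNV0_32_INIT and FNV_32_PRIME
# are assumed to be the standard FNV constants defined elsewhere in this module; the
# input bytes are made up for the example.
import numpy as np

data = np.frombuffer(b'hello world', dtype=np.uint8).copy()   # writable uint8 array
h = fnv0_32(data)
# The same bytes always hash to the same 32-bit value.
assert h == fnv0_32(np.frombuffer(b'hello world', dtype=np.uint8).copy())
assert 0 <= h < 2**32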