Example 1
def _convert_in2ind_inplace(id2ind, ind2id, id2ind_len, arr, allow_missing):
    """Convert IDs to indices in-place.

    id2ind is a NumPy array: a hash map mapping IDs to candidate
    indices; id2ind_len is its length. ind2id maps indices to their IDs.
    arr is the input/output array. When allow_missing is True,
    unrecognised IDs do not raise, but are instead replaced with
    SENTINEL.
    """
    for i, id_ in enumerate(arr):
        hash_ = nb.u4(id_ * nb.u4(0x9e3779b1)) % id2ind_len
        while True:
            index = id2ind[hash_]
            if index == SENTINEL:
                if not allow_missing:
                    raise KeyError('id not found')
                arr[i] = SENTINEL
                break
            # id2ind[hash_] might be the correct index, but this is not
            # guaranteed due to collisions. Look it up in ind2id to make
            # sure.
            if ind2id[index] == id_:
                arr[i] = index
                break
            hash_ += 1  # id2ind[hash_] was not correct. Try next cell.
            if hash_ >= id2ind_len:
                hash_ = 0  # Went past the end; wrap around.
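
A minimal usage sketch, not part of the original module: it assumes the helper is compiled with numba.njit and that SENTINEL is an all-ones uint32 marking both empty hash-map cells and missing IDs. The map is built here with the same multiplicative hash and linear probing that _make_hash_map uses in Example 5.

import numpy as np

SENTINEL = np.uint32(0xFFFFFFFF)  # assumed value of the module constant

ind2id = np.array([10, 42, 7], dtype=np.uint32)           # index -> ID
id2ind_len = 8
id2ind = np.full(id2ind_len, SENTINEL, dtype=np.uint32)   # hash map: ID -> index
for index, id_ in enumerate(ind2id):
    hash_ = (int(id_) * 0x9E3779B1) % 2**32 % id2ind_len
    while id2ind[hash_] != SENTINEL:                       # linear probing on collision
        hash_ = (hash_ + 1) % id2ind_len
    id2ind[hash_] = index

ids = np.array([42, 7, 999], dtype=np.uint32)
_convert_in2ind_inplace(id2ind, ind2id, id2ind_len, ids, True)
# ids is now [1, 2, SENTINEL]: 999 is unknown, and with allow_missing=True
# it is replaced with SENTINEL instead of raising KeyError.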
Example 2
def get_u32(buf, offset, length):
    """Read a little-endian unsigned 32-bit integer from buf at offset.

    Returns (value, new offset, remaining length). If fewer than 4
    bytes remain, returns (0, offset, length) unchanged.
    """
    if length < 4:
        return (0, offset, length)
    a = nb.u4(buf[offset + 3]) << 24
    b = nb.u4(buf[offset + 2]) << 16
    c = nb.u4(buf[offset + 1]) << 8
    d = nb.u4(buf[offset + 0]) << 0
    return a | b | c | d, offset + 4, length - 4
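
An illustrative check (assuming get_u32 is njit-compiled; the decorator and imports are not shown in this excerpt): the four bytes are assembled little-endian, and the returned offset/length let the caller keep parsing the same buffer.

import numpy as np

buf = np.frombuffer(b"\x78\x56\x34\x12\xff", dtype=np.uint8)
value, offset, length = get_u32(buf, 0, buf.size)
assert value == 0x12345678          # lowest-addressed byte is least significant
assert offset == 4 and length == 1  # one byte of payload left to parse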
Example 3
def convolve(signal, ref, window, result):
    smem = cuda.shared.array(0, f8)

    i, j = cuda.grid(2)
    S = signal.size
    W = window.size
    R = ref.shape[0]

    Bix = cuda.blockIdx.x   # Block index along the x dimension        -> indexing the signal
    BDx = cuda.blockDim.x   # Number of threads along x                -> used in several index computations
    tix = cuda.threadIdx.x  # x thread id within block [0, blockDim.x) -> indexing the window
    tiy = cuda.threadIdx.y  # y thread id within block [0, blockDim.y) -> indexing shared-memory rows
    tif = tix + tiy * BDx   # Thread index within the block (flat)     -> indexing lanes and shared memory

    index = j + tix  # reference and signal index

    value = f8(0)
    if (tix < W) & (index < S):
        value = window[tix] * (ref[R, index] * signal[index])
    value = reduce_warp(value, u4(0xffffffff))
    # The reduced sum is now held by every thread with lane index == 0

    # Store the warp reduction in the shared memory
    if tif % 32 == 0:  # For all threads with lane index == 0
        smem[tif // 32] = value  # Flat warp id
    cuda.syncthreads()

    # When the block's x dimension is no larger than a single warp (32), we are done:
    # the warp reduction already produced the final value, and we know exactly
    # where each row's result sits in shared memory.
    if (BDx <= 32) and (tix == 0):
        result[Bix, j] = smem[tiy]

    # Otherwise, take the per-warp results from shared memory and reduce them again.
    # NOTE: a block holds at most 1024 threads, i.e. 32 warps of 32 threads,
    # so the warp reductions of all 32 warps fit back into a single warp.

    # Spread the per-warp results from shared memory over the first threads
    # along the x direction. All other threads contribute 0.
    Nwx = (BDx - 1) // 32 + 1  # Number of warps along x
    if (tix < BDx // 32):
        value = smem[tix + Nwx * tiy]
    else:
        value = 0
    # Perhaps it's better to put the index definition outside the if-else block and remove this barrier
    cuda.syncthreads()

    # Reduce the per-warp partial sums within the first warp along x
    if tix // 32 == 0:
        value = reduce_warp(value, u4(0xffffffff))
    cuda.syncthreads()

    if (tix == 0) and (j < S):
        result[Bix, j] = value
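
reduce_warp is not shown in this excerpt. The following is a minimal sketch of what it is assumed to do: a shuffle-down sum over the 32 lanes of a warp that leaves the total in lane 0, consistent with the comments above.

from numba import cuda

@cuda.jit(device=True)
def reduce_warp(value, mask):
    # Tree reduction via warp shuffles: after the loop, the lane with
    # lane index 0 holds the sum of all participating lanes.
    offset = 16
    while offset > 0:
        value += cuda.shfl_down_sync(mask, value, offset)
        offset //= 2
    return value

Since the kernel uses a dynamic shared array (cuda.shared.array(0, f8)), a launch has to supply the shared-memory size in bytes as the fourth launch parameter, e.g. convolve[grid, block, 0, n_warps_per_block * 8](signal, ref, window, result), assuming the kernel carries a @cuda.jit decorator that this excerpt omits.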
Example 4
def _get_string(buf_in, offset, length, check_null):
    """Copy up to length bytes from buf_in, starting at offset.

    buf_out is a fixed 256-byte scratch buffer, so length must not
    exceed 256. When check_null is True, copying stops at the first
    NUL byte and that terminator is consumed as well.
    Returns (copied bytes, new offset, number of bytes consumed).
    """
    buf_out = np.zeros(256, dtype=np.uint8)
    i = nb.u4(0)
    null_term = False
    while i < length:
        if check_null and buf_in[offset + i] == 0:
            null_term = True
            break
        buf_out[i] = buf_in[offset + i]
        i = nb.u4(i + nb.u4(1))
    if null_term:
        return buf_out[:i], offset + i + 1, i + 1
    else:
        return buf_out[:i], offset + i, i
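
An illustrative call (assuming the helper is njit-compiled and that np/nb refer to numpy/numba): read a NUL-terminated field and advance the cursor past the terminator.

import numpy as np

buf = np.frombuffer(b"hello\x00world", dtype=np.uint8)
name, offset, consumed = _get_string(buf, 0, buf.size, True)
assert bytes(name) == b"hello"
assert offset == 6 and consumed == 6  # the NUL terminator is consumed too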
Example 5
def _make_hash_map(ids, id2ind, arr_size, offset):
    """Place IDs in the hash map.

    ids is a NumPy array of IDs, ordered by the index. id2ind is the
    target array; arr_size is its size. offset is a constant added to
    the indices.
    """
    for i, id_ in enumerate(ids):
        # Pretty bad hashing method (by Knuth), but our IDs are already
        # almost uniform, they just need a little bit of help :)
        hash_ = nb.u4(id_ * nb.u4(0x9e3779b1)) % arr_size
        while id2ind[hash_] != SENTINEL:  # Cell not free.
            hash_ += 1
            if hash_ >= arr_size:
                hash_ = 0  # Went past the end; wrap around.
        id2ind[hash_] = i + offset
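
A hypothetical usage sketch (same assumptions about SENTINEL and numba.njit as in Example 1): the offset parameter lets a second batch of IDs be placed behind indices that are already in use.

import numpy as np

SENTINEL = np.uint32(0xFFFFFFFF)  # assumed value of the module constant
arr_size = 16
id2ind = np.full(arr_size, SENTINEL, dtype=np.uint32)

first_ids = np.array([10, 42, 7], dtype=np.uint32)
_make_hash_map(first_ids, id2ind, arr_size, 0)               # stores indices 0, 1, 2

more_ids = np.array([1000, 1001], dtype=np.uint32)
_make_hash_map(more_ids, id2ind, arr_size, first_ids.size)   # stores indices 3, 4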
Example 6
        if tree_ops[i]:  # ADDITION: 1
            # print('value[{}] = {} + {}'.format(target, value_array[target], value_array[source1]))
            # value_array[target] = value_array[target] + value_array[source1]
            value_array[target] = value_array[target] + value_array[
                tree_recipe[i, 1]]
        else:
            # print('value[{}] = {} * {}'.format(target, value_array[target], value_array[source1]))
            # value_array[target] = value_array[target] * value_array[source1]
            value_array[target] = value_array[target] * value_array[
                tree_recipe[i, 1]]

    # the value at the first position is the value of the polynomial
    return value_array[0]
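
The fragment above applies a precomputed factorisation recipe: tree_ops[i] selects addition (1) or multiplication (0) and tree_recipe[i, 1] is the source position in value_array; the target index is assumed here to come from the recipe as well. A hypothetical, self-contained illustration of that encoding, evaluating p(x) = 5 + x*(2 + 3*x) at x = 2 with a made-up recipe:

import numpy as np

x = 2.0
value_array = np.array([5.0, 2.0, 3.0, x])          # coefficients 5, 2, 3 and the variable x
tree_recipe = np.array([[2, 3], [1, 2], [1, 3], [0, 1]], dtype=np.uint32)
tree_ops = np.array([False, True, False, True])     # MUL, ADD, MUL, ADD

for i in range(tree_recipe.shape[0]):
    target, source = tree_recipe[i, 0], tree_recipe[i, 1]
    if tree_ops[i]:  # ADDITION: 1
        value_array[target] += value_array[source]
    else:
        value_array[target] *= value_array[source]

assert value_array[0] == 5.0 + x * (2.0 + 3.0 * x)  # 21.0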


@jit(u4(u4[:]), nopython=True, cache=True)
def num_ops_1D_horner(unique_exponents):
    """
    :param unique_exponents: np array of unique exponents sorted in increasing order without 0
    :return: the number of operations of the one dimensional horner factorisation
        without counting additions (just MUL & POW) and without considering the coefficients


    NOTE: in 1D the horner factorisation is both unique and optimal (minimal amount of operations)
    """
    nr_unique_exponents = unique_exponents.shape[0]
    # the exponent 0 is not present!
    assert not np.any(unique_exponents == 0)

    if nr_unique_exponents == 0:
        return 0
Example 7

@njit(nogil=True, fastmath=True, cache=True)
def fnv(data, hval_init, fnv_prime, fnv_size):
    """
    Core FNV hash algorithm used in FNV0 and FNV1.
    """
    hval = hval_init
    for i in range(len(data)):
        byte = data[i]
        hval = (hval * fnv_prime) % fnv_size
        hval = hval ^ byte
    return hval


@njit(u4(u1[:]), nogil=True, fastmath=True, cache=True)
def fnv0_32(data):
    """
    Returns the 32 bit FNV-0 hash value for the given data.
    """
    return fnv(data, FNV0_32_INIT, FNV_32_PRIME, 2**32)


# @njit(u8(u1[:]),nogil=True,fastmath=True,cache=True)
# def fnv0_64(data):
#     """
#     Returns the 64 bit FNV-0 hash value for the given data.
#     """
#     return fnv(data, FNV0_64_INIT, FNV_64_PRIME, 2**64)
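
The constants used above are defined elsewhere in the original module; the standard 32-bit FNV parameters are assumed in this illustrative check (FNV-0 starts from an offset basis of 0, and the 32-bit FNV prime is 0x01000193).

import numpy as np

FNV_32_PRIME = 0x01000193  # standard 32-bit FNV prime (assumed)
FNV0_32_INIT = 0           # FNV-0 offset basis (assumed)

# With a zero initial value, the first multiply leaves hval at 0, so the
# FNV-0 hash of a single byte is just that byte.
data = np.array([ord("a")], dtype=np.uint8)
assert fnv0_32(data) == ord("a")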