Example #1
def _text_contains_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
) -> np.ndarray:
    failure_function = compute_kmp_failure_function(pat)

    # Initialise boolean (bit-packed) output array.
    output_size = length // 8
    if length % 8 > 0:
        output_size += 1
    output = np.empty(output_size, dtype=np.uint8)

    if length % 8 > 0:
        # Zero the padding bits of the last byte (np.empty leaves garbage).
        output[-1] = 0

    has_nulls = valid_bits.size > 0

    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits,
                                              valid_offset):
            continue

        matched_len = 0
        contains = False
        for str_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            if matched_len == len(pat):
                contains = True
                break

            # Manually inlined utils.kmp.append_to_kmp_matching for
            # performance
            while matched_len > -1 and pat[matched_len] != data[str_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1

        # Handle a match that completes exactly at the last byte of the
        # row (the in-loop check runs before the KMP update).
        if matched_len == len(pat):
            contains = True

        # Write out the result into the bit-mask
        byte_offset_result = row_idx // 8
        bit_offset_result = row_idx % 8
        mask_result = np.uint8(1 << bit_offset_result)
        current = output[byte_offset_result]
        if contains:  # must branch, not use bit-wise math: other rows' bits share this byte
            output[byte_offset_result] = current | mask_result
        else:
            output[byte_offset_result] = current & ~mask_result

    return output
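
All three examples call two fletcher helpers whose definitions are not shown: compute_kmp_failure_function, which builds the Knuth-Morris-Pratt failure table for pat, and _check_valid_row, which tests one bit of the Arrow validity bitmap. The sketch below reconstructs plausible implementations from how the examples index into them (the -1 sentinel in failure_function[0], LSB-first bit order); treat it as an assumption-labelled stand-in, not fletcher's actual source. The examples themselves additionally assume import numpy as np and, for Example #3, from typing import Tuple.

import numpy as np


def compute_kmp_failure_function(pat: bytes) -> np.ndarray:
    # fail[i] is the length of the longest proper prefix of pat[:i]
    # that is also a suffix of pat[:i]; fail[0] = -1 acts as a sentinel
    # so the matching loop can fall through after a mismatch at the
    # first pattern byte.
    fail = np.empty(len(pat) + 1, dtype=np.int64)
    fail[0] = -1
    matched = -1
    for i in range(len(pat)):
        while matched > -1 and pat[matched] != pat[i]:
            matched = fail[matched]
        matched += 1
        fail[i + 1] = matched
    return fail


def _check_valid_row(row_idx: int, valid_bits: np.ndarray,
                     valid_offset: int) -> bool:
    # Arrow validity bitmaps are LSB-ordered: row k maps to bit k % 8
    # of byte k // 8, matching the output packing in Example #1.
    bit_idx = row_idx + valid_offset
    return bool(valid_bits[bit_idx // 8] & np.uint8(1 << (bit_idx % 8)))

With those stand-ins in place, Example #1 can be driven directly from a pyarrow StringArray's buffers (run as plain Python here, since the numba decorator is not part of the excerpt). The array contents below are purely illustrative:

import pyarrow as pa

arr = pa.array(["spam", None, "ham and spam", "eggs"])
validity, offsets_buf, data_buf = arr.buffers()
valid_bits = (np.frombuffer(validity, dtype=np.uint8)
              if validity is not None else np.empty(0, dtype=np.uint8))
offsets = np.frombuffer(offsets_buf, dtype=np.int32)
data = np.frombuffer(data_buf, dtype=np.uint8)

packed = _text_contains_case_sensitive_numba(
    len(arr), valid_bits, arr.offset, offsets, data, b"spam")
# Unpack the LSB-ordered result bitmap; bits of null rows carry no
# meaning and must be masked with the validity bitmap by the caller.
contains = np.unpackbits(packed, bitorder="little")[: len(arr)].astype(bool)
# contains -> [True, False (null), True, False]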
Example #2
def _text_count_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
) -> np.ndarray:
    failure_function = compute_kmp_failure_function(pat)

    output = np.empty(length, dtype=np.int64)

    has_nulls = valid_bits.size > 0

    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits,
                                              valid_offset):
            continue

        matched_len = 0
        output[row_idx] = 0

        if len(pat) == 0:
            # An empty pattern matches before every byte and once at
            # the end: row_len + 1 times, mirroring Python's str.count.
            output[row_idx] = offsets[row_idx + 1] - offsets[row_idx] + 1
            continue

        for str_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            # Manually inlined utils.kmp.append_to_kmp_matching for performance
            while matched_len > -1 and pat[matched_len] != data[str_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1

            if matched_len == len(pat):
                output[row_idx] += 1
                # `matched_len=0` ensures overlapping matches are not counted.
                # This matches the behavior of Python's builtin `count`
                # function.
                matched_len = 0

    return output
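
Since the kernel skips null rows entirely, their slots in the np.empty output stay uninitialized and are expected to be masked out by the caller. Below is a quick, illustrative sanity check against Python's str.count, which uses the same non-overlapping semantics; buffer extraction follows the Example #1 sketch above:

arr = pa.array(["banana", None, "an analysis"])
validity, offsets_buf, data_buf = arr.buffers()
valid_bits = np.frombuffer(validity, dtype=np.uint8)
offsets = np.frombuffer(offsets_buf, dtype=np.int32)
data = np.frombuffer(data_buf, dtype=np.uint8)

counts = _text_count_case_sensitive_numba(
    len(arr), valid_bits, arr.offset, offsets, data, b"an")
assert counts[0] == "banana".count("an")       # 2
assert counts[2] == "an analysis".count("an")  # 2
# counts[1] is uninitialized garbage: row 1 is null.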
Example #3
def _text_replace_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
    repl: bytes,
    max_repl: int,
) -> Tuple[np.ndarray, np.ndarray]:

    failure_function = compute_kmp_failure_function(pat)

    # First pass: count the matches per row to size the output buffer
    # and compute the per-row output offsets.
    output_offsets = np.empty(length + 1, dtype=np.int32)
    cumulative_offset = 0

    has_nulls = valid_bits.size > 0
    match_len_change = len(repl) - len(pat)

    for row_idx in range(length):
        output_offsets[row_idx] = cumulative_offset

        if has_nulls and not _check_valid_row(row_idx, valid_bits,
                                              valid_offset):
            continue

        row_len = offsets[row_idx + 1] - offsets[row_idx]
        cumulative_offset += row_len

        matched_len = 0
        matches_done = 0

        for str_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            # Manually inlined utils.kmp.append_to_kmp_matching for performance
            while matched_len > -1 and pat[matched_len] != data[str_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1

            if matched_len == len(pat):
                matches_done += 1
                matched_len = 0
                if matches_done == max_repl:
                    break

        cumulative_offset += match_len_change * matches_done

    output_offsets[length] = cumulative_offset

    # Second pass: copy each row into the output buffer, substituting
    # repl for every counted occurrence of pat.
    output_buffer = np.empty(cumulative_offset, dtype=np.uint8)
    output_pos = 0
    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits,
                                              valid_offset):
            continue

        matched_len = 0
        matches_done = 0

        write_idx = offsets[row_idx]
        for read_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            # A modified version of utils.kmp.append_to_kmp_matching
            while matched_len > -1 and pat[matched_len] != data[read_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1

            # The write cursor trails the read cursor by at most
            # len(pat) bytes. A byte that far behind can no longer be
            # part of a match ending at read_idx, so it is safe to copy
            # it through to the output verbatim.
            if read_idx - write_idx == len(pat):
                output_buffer[output_pos] = data[write_idx]
                output_pos += 1
                write_idx += 1

            if matched_len == len(pat):
                matched_len = 0
                # A negative max_repl never equals matches_done, so it
                # means "replace all occurrences".
                if matches_done != max_repl:
                    matches_done += 1
                    # Skip the write cursor past the matched bytes;
                    # they are superseded by repl.
                    write_idx = read_idx + 1

                    for char in repl:
                        output_buffer[output_pos] = char
                        output_pos += 1

        # Flush the bytes still trailing the read cursor at end of row.
        while write_idx < offsets[row_idx + 1]:
            output_buffer[output_pos] = data[write_idx]
            output_pos += 1
            write_idx += 1

    return output_offsets, output_buffer
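
To see the two-pass kernel end to end, the sketch below (again with illustrative inputs) runs it over a small array and reassembles the returned offsets/buffer pair into Python strings. Null rows come back as empty slices, since the first pass assigns them zero length, and would be re-masked by the caller:

arr = pa.array(["spam and spam", None, "no match"])
validity, offsets_buf, data_buf = arr.buffers()
valid_bits = np.frombuffer(validity, dtype=np.uint8)
offsets = np.frombuffer(offsets_buf, dtype=np.int32)
data = np.frombuffer(data_buf, dtype=np.uint8)

out_offsets, out_buffer = _text_replace_case_sensitive_numba(
    len(arr), valid_bits, arr.offset, offsets, data,
    b"spam", b"bacon", -1)

rows = [bytes(out_buffer[out_offsets[i]:out_offsets[i + 1]]).decode()
        for i in range(len(arr))]
# rows -> ['bacon and bacon', '', 'no match']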