def find_near_matches_generic(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
    if search_params.max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
        ]

    # if the n-gram length would be at least 3, use the n-gram search method
    elif len(subsequence) // (search_params.max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(subsequence, sequence, search_params)

    # use the linear programming search method
    else:
        return find_near_matches_generic_linear_programming(subsequence, sequence, search_params)
Beispiel #2
0
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the given
    parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    if max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0,
                  sequence[start_index:start_index + len(subsequence)])
            for start_index in search_exact(subsequence, sequence)
        ]

    elif len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(subsequence,
                                                    sequence,
                                                    max_l_dist)

    else:
        return find_near_matches_levenshtein_linear_programming(subsequence,
                                                                sequence,
                                                                max_l_dist)
Beispiel #3
0
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the given
    parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    if max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
        ]

    elif len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(subsequence,
                                                    sequence,
                                                    max_l_dist)

    else:
        return find_near_matches_levenshtein_linear_programming(subsequence,
                                                                sequence,
                                                                max_l_dist)
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the given
    parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
        ]

    elif len(subsequence) // (max_substitutions + 1) >= 3:
        return find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions,
        )

    else:
        return find_near_matches_substitutions_lp(
            subsequence, sequence, max_substitutions,
        )
Beispiel #5
0
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    def make_match(start, end, dist):
        return Match(start, end, dist, matched=sequence[start:end])

    ngram_len = subseq_len // (max_substitutions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!")

    for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_after = subsequence[ngram_end:]
        for index in search_exact(
                subsequence[ngram_start:ngram_end],
                sequence,
                ngram_start,
                seq_len - (subseq_len - ngram_end),
        ):
            n_substitutions = 0
            seq_before = sequence[index - ngram_start:index]
            if subseq_before != seq_before:
                n_substitutions += count_differences_with_maximum(
                    seq_before, subseq_before,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            seq_after = sequence[index + ngram_len:index - ngram_start +
                                 subseq_len]
            if subseq_after != seq_after:
                if n_substitutions == max_substitutions:
                    continue
                n_substitutions += count_differences_with_maximum(
                    seq_after, subseq_after,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            yield make_match(
                start=index - ngram_start,
                end=index - ngram_start + subseq_len,
                dist=n_substitutions,
            )
def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        for start_index in search_exact(subsequence, sequence):
            return True
        return False

    elif len(subsequence) // (max_substitutions + 1) >= 3:
        return has_near_match_substitutions_ngrams(
            subsequence, sequence, max_substitutions,
        )

    else:
        return has_near_match_substitutions_lp(
            subsequence, sequence, max_substitutions,
        )
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    def make_match(start, end, dist):
        return Match(start, end, dist, matched=sequence[start:end])

    for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before_reversed = subsequence[:ngram_start][::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(subsequence[ngram_start:ngram_end], sequence,
                                  start_index, end_index):
            # try to expand left and/or right according to n_ngram
            dist_right, right_expand_size = _expand(
                subseq_after,
                sequence[index + ngram_len:index - ngram_start + subseq_len +
                         max_l_dist],
                max_l_dist,
            )
            if dist_right is None:
                continue
            dist_left, left_expand_size = _expand(
                subseq_before_reversed,
                sequence[max(0, index - ngram_start -
                             (max_l_dist - dist_right)):index][::-1],
                max_l_dist - dist_right,
            )
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            yield make_match(
                start=index - left_expand_size,
                end=index + ngram_len + right_expand_size,
                dist=dist_left + dist_right,
            )
Beispiel #8
0
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist

    # optimization: prepare some often used things in advance
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(subsequence[ngram_start:ngram_end], sequence,
                                  start_index, end_index):
            # try to expand left and/or right according to n_ngram
            for match in find_near_matches_generic_linear_programming(
                    subsequence,
                    sequence[max(0, index - ngram_start - max_l_dist):index -
                             ngram_start + subseq_len + max_l_dist],
                    search_params,
            ):
                yield attr.evolve(
                    match,
                    start=match.start +
                    max(0, index - ngram_start - max_l_dist),
                    end=match.end + max(0, index - ngram_start - max_l_dist),
                )
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!"
        )

    for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_after = subsequence[ngram_end:]
        for index in search_exact(
                subsequence[ngram_start:ngram_end], sequence,
                ngram_start, seq_len - (subseq_len - ngram_end),
        ):
            n_substitutions = 0
            seq_before = sequence[index - ngram_start:index]
            if subseq_before != seq_before:
                n_substitutions += count_differences_with_maximum(
                    seq_before, subseq_before,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len]
            if subseq_after != seq_after:
                if n_substitutions == max_substitutions:
                    continue
                n_substitutions += count_differences_with_maximum(
                    seq_after, subseq_after,
                    max_substitutions - n_substitutions + 1)
                if n_substitutions > max_substitutions:
                    continue

            yield Match(
                start=index - ngram_start,
                end=index - ngram_start + subseq_len,
                dist=n_substitutions,
            )
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError('the subsequence length must be greater than max_l_dist')

    for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before_reversed = subsequence[:ngram_start][::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(subsequence[ngram_start:ngram_end], sequence, start_index, end_index):
            # try to expand left and/or right according to n_ngram
            dist_right, right_expand_size = _expand(
                subseq_after,
                sequence[index + ngram_len:index - ngram_start + subseq_len + max_l_dist],
                max_l_dist,
            )
            if dist_right is None:
                continue
            dist_left, left_expand_size = _expand(
                subseq_before_reversed,
                sequence[max(0, index - ngram_start - (max_l_dist - dist_right)):index][::-1],
                max_l_dist - dist_right,
            )
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            yield Match(
                start=index - left_expand_size,
                end=index + ngram_len + right_expand_size,
                dist=dist_left + dist_right,
            )
 def search(self, subsequence, sequence, start_index=0, end_index=None):
     return list(search_exact(subsequence, sequence, start_index, end_index))
Beispiel #12
0
def find_near_matches_no_deletions_ngrams(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * no deletions are allowed
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked

    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    matches = []
    matched_indexes = set()

    for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_insertions)
        end_index = min(seq_len, seq_len - (subseq_len - ngram_end) + max_insertions)

        for index in search_exact(
                subsequence[ngram_start:ngram_end], sequence,
                start_index, end_index,
        ):
            if index - ngram_start in matched_indexes:
                continue

            seq_after = sequence[index + ngram_len:index + subseq_len - ngram_start + max_insertions]
            if seq_after.startswith(subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(subseq_after, seq_after,
                                        max_substitutions, max_insertions, max_l_dist)
                if not matches_after:
                    continue

            _max_substitutions = max_substitutions - min(m[0] for m in matches_after)
            _max_insertions = max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = max_l_dist - min(m[0] + m[1] for m in matches_after)
            seq_before = sequence[index - ngram_start - _max_insertions:index]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    subseq_before_reversed, seq_before[::-1],
                    _max_substitutions, _max_insertions, _max_l_dist,
                )

            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    if (
                            subs_before + subs_after <= max_substitutions and
                            ins_before + ins_after <= max_insertions and
                            subs_before + subs_after + ins_before + ins_after <= max_l_dist
                    ):
                        matches.append(Match(
                            start=index - ngram_start - ins_before,
                            end=index - ngram_start + subseq_len + ins_after,
                            dist=subs_before + subs_after + ins_before + ins_after,
                        ))
                        matched_indexes |= set(range(
                            index - ngram_start - ins_before,
                            index - ngram_start - ins_before + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)
Beispiel #13
0
 def search(self, subsequence, sequence, start_index=0, end_index=None):
     return list(search_exact(subsequence, sequence, start_index,
                              end_index))
Beispiel #14
0
def find_near_matches_no_deletions_ngrams(subsequence, sequence,
                                          search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * no deletions are allowed
    * the total number of substitutions, insertions and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked

    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    def make_match(start, end, dist):
        return Match(start, end, dist, matched=sequence[start:end])

    matches = []
    matched_indexes = set()

    for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_insertions)
        end_index = min(seq_len,
                        seq_len - (subseq_len - ngram_end) + max_insertions)

        for index in search_exact(
                subsequence[ngram_start:ngram_end],
                sequence,
                start_index,
                end_index,
        ):
            if index - ngram_start in matched_indexes:
                continue

            seq_after = sequence[index + ngram_len:index + subseq_len -
                                 ngram_start + max_insertions]
            if seq_after.startswith(subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(subseq_after, seq_after,
                                        max_substitutions, max_insertions,
                                        max_l_dist)
                if not matches_after:
                    continue

            _max_substitutions = max_substitutions - min(
                m[0] for m in matches_after)
            _max_insertions = max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = max_l_dist - min(m[0] + m[1] for m in matches_after)
            seq_before = sequence[index - ngram_start - _max_insertions:index]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    subseq_before_reversed,
                    seq_before[::-1],
                    _max_substitutions,
                    _max_insertions,
                    _max_l_dist,
                )

            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    if (subs_before + subs_after <= max_substitutions
                            and ins_before + ins_after <= max_insertions
                            and subs_before + subs_after + ins_before +
                            ins_after <= max_l_dist):
                        matches.append(
                            make_match(
                                start=index - ngram_start - ins_before,
                                end=index - ngram_start + subseq_len +
                                ins_after,
                                dist=subs_before + subs_after + ins_before +
                                ins_after,
                            ))
                        matched_indexes |= set(
                            range(
                                index - ngram_start - ins_before,
                                index - ngram_start - ins_before +
                                max_insertions + 1,
                            ))

    return sorted(matches, key=lambda match: match.start)