Beispiel #1
0
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Search the sequence for near-matches of the subsequence.

    The search strategy is picked from the arguments:

    * max_l_dist == 0: plain exact search
    * n-gram length (len(subsequence) // (max_l_dist + 1)) of at least 3:
      the n-gram based search
    * otherwise: the linear programming search, keeping only the best match
      out of each group of matches

    Raises ValueError if the subsequence is empty or max_l_dist is negative.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    # no differences allowed: every exact hit is a match with distance zero
    if max_l_dist == 0:
        exact_matches = []
        for pos in search_exact(subsequence, sequence):
            exact_matches.append(Match(pos, pos + len(subsequence), 0))
        return exact_matches

    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist)

    candidates = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    winners = [
        get_best_match_in_group(grp) for grp in group_matches(candidates)
    ]
    return sorted(winners)
def _find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """Yield near-matches of the subsequence found by anchoring on n-grams.

    The subsequence is cut into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``.  Every exact occurrence of an
    n-gram in the sequence selects a window of the sequence around it; that
    window is searched with find_near_matches_generic_linear_programming()
    and each match found is yielded with its start/end shifted from window
    indexes back to indexes in the full sequence.

    search_params must provide ``unpacked``, a 4-tuple whose last item is
    the maximum total distance (max_l_dist).  It is passed on unchanged to
    the windowed search.

    Raises ValueError (when iteration starts, as this is a generator) if the
    n-gram length would be zero.
    """
    # only the total distance limit is needed in this function
    _, _, _, max_l_dist = search_params.unpacked

    # optimization: prepare some often used things in advance
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    # range() works on both Python 2 and 3, unlike xrange()
    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        # bounds for the exact search: the n-gram's own offsets, widened by
        # max_l_dist and clamped to the sequence
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(ngram, sequence, start_index, end_index):
            # window around the hit, large enough for the whole subsequence
            # plus max_l_dist extra items on each side
            window_start = max(0, index - ngram_start - max_l_dist)
            window_end = index - ngram_start + subseq_len + max_l_dist
            for match in find_near_matches_generic_linear_programming(
                    subsequence,
                    sequence[window_start:window_end],
                    search_params,
            ):
                # matches are relative to the window; translate them back
                yield match._replace(
                    start=match.start + window_start,
                    end=match.end + window_start,
                )
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Search the sequence for near-matches, allowing only substitutions.

    The arguments are first validated by _check_arguments().  Then a search
    implementation is picked according to the given parameters:

    * max_substitutions == 0: plain exact search
    * n-gram length (len(subsequence) // (max_substitutions + 1)) of at
      least 3: the n-gram based search
    * otherwise: the "lp" search

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # nothing may differ: every exact hit is a match with distance zero
    if max_substitutions == 0:
        exact_matches = []
        for pos in search_exact(subsequence, sequence):
            exact_matches.append(Match(pos, pos + len(subsequence), 0))
        return exact_matches

    if len(subsequence) // (max_substitutions + 1) >= 3:
        search = find_near_matches_substitutions_ngrams
    else:
        search = find_near_matches_substitutions_lp
    return search(subsequence, sequence, max_substitutions)
Beispiel #4
0
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence by anchoring on exact n-grams.

    The subsequence is split into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``.  Every exact occurrence of an
    n-gram in the sequence is used as an anchor, which _expand() tries to
    extend first to the right and then to the left; the distance spent on the
    right side is subtracted from what the left side may use.

    Overlapping matches are not returned; instead, overlapping matches are
    grouped together and the best match from each group is kept.

    Raises ValueError if the n-gram length would be zero.

    Returns a sorted list of Match objects.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    matches = []
    # range() works on both Python 2 and 3, unlike xrange()
    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        # the part before the n-gram is reversed, as is the matching slice of
        # the sequence below, so the same _expand() serves both directions
        subseq_before_reversed = subsequence[:ngram_start][::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(subsequence[ngram_start:ngram_end], sequence,
                                  start_index, end_index):
            # try to expand to the right of the n-gram first
            dist_right, right_expand_size = _expand(
                subseq_after,
                sequence[index + ngram_len:index - ngram_start + subseq_len +
                         max_l_dist],
                max_l_dist,
            )
            if dist_right is None:
                continue

            # the left side may only use what the right side left over
            left_budget = max_l_dist - dist_right
            dist_left, left_expand_size = _expand(
                subseq_before_reversed,
                sequence[max(0, index - ngram_start - left_budget):index][::-1],
                left_budget,
            )
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            matches.append(
                Match(
                    start=index - left_expand_size,
                    end=index + ngram_len + right_expand_size,
                    dist=dist_left + dist_right,
                ))

    # don't return overlapping matches; instead, group overlapping matches
    # together and return the best match from each group
    match_groups = group_matches(matches)
    best_matches = [get_best_match_in_group(group) for group in match_groups]
    return sorted(best_matches)
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    """Yield substitution-only near-matches located through exact n-gram hits.

    The subsequence is split into consecutive n-grams of length
    ``len(subsequence) // (max_substitutions + 1)``.  For every exact hit of
    an n-gram, the parts of the subsequence before and after the n-gram are
    compared with the equally long parts of the sequence around the hit; a
    Match is yielded when the differences do not exceed max_substitutions.

    Raises ValueError (when iteration starts, as this is a generator) if the
    n-gram length would be zero.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!")

    last_ngram_start = subseq_len - ngram_len
    for ngram_start in range(0, last_ngram_start + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        prefix = subsequence[:ngram_start]
        suffix = subsequence[ngram_end:]
        # only look where the whole subsequence fits around the n-gram
        hits = search_exact(
            subsequence[ngram_start:ngram_end],
            sequence,
            ngram_start,
            seq_len - (subseq_len - ngram_end),
        )
        for index in hits:
            match_start = index - ngram_start
            match_end = match_start + subseq_len
            diff_count = 0

            seq_prefix = sequence[match_start:index]
            if prefix != seq_prefix:
                diff_count += count_differences_with_maximum(
                    seq_prefix, prefix,
                    max_substitutions - diff_count + 1)
                if diff_count > max_substitutions:
                    continue

            seq_suffix = sequence[index + ngram_len:match_end]
            if suffix != seq_suffix:
                # the budget is already used up, so any difference is too many
                if diff_count == max_substitutions:
                    continue
                diff_count += count_differences_with_maximum(
                    seq_suffix, suffix,
                    max_substitutions - diff_count + 1)
                if diff_count > max_substitutions:
                    continue

            yield Match(start=match_start, end=match_end, dist=diff_count)
def has_near_match_substitutions(subsequence, sequence, max_substitutions):
    """Tell whether the sequence contains a substitution-only near-match.

    The arguments are first validated by _check_arguments().  Then the check
    is delegated according to the given parameters:

    * max_substitutions == 0: plain exact search
    * n-gram length (len(subsequence) // (max_substitutions + 1)) of at
      least 3: the n-gram based check
    * otherwise: the "lp" check
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    if max_substitutions == 0:
        # a single exact hit settles it; nothing past the first is consumed
        return any(True for _ in search_exact(subsequence, sequence))

    if len(subsequence) // (max_substitutions + 1) >= 3:
        checker = has_near_match_substitutions_ngrams
    else:
        checker = has_near_match_substitutions_lp
    return checker(subsequence, sequence, max_substitutions)
Beispiel #7
0
def choose_search_func(search_params):
    """Pick the most specialized search function for the given parameters.

    search_params must provide ``unpacked``, the 4-tuple
    (max_substitutions, max_insertions, max_deletions, max_l_dist), as well
    as a ``max_l_dist`` attribute.

    Returns a callable taking (subsequence, sequence, search_params).
    """
    max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
    if search_params.max_l_dist == 0:
        def exact_search(subsequence, sequence, search_params):
            return [
                Match(index, index + len(subsequence), 0)
                for index in search_exact(subsequence, sequence)
            ]
        return exact_search

    # if only substitutions are allowed, use find_near_matches_substitutions()
    if max_insertions == 0 and max_deletions == 0:
        def substitutions_search(subsequence, sequence, search_params):
            # the effective limit is the smaller of the two limits that are set
            return find_near_matches_substitutions(
                subsequence, sequence,
                min([x for x in [search_params.max_l_dist, search_params.max_substitutions] if x is not None])
            )
        return substitutions_search

    # if it is enough to just take into account the maximum Levenshtein
    # distance, use find_near_matches_levenshtein(); a limit of None is
    # treated as a very large number
    unlimited = 1 << 29
    per_kind_limits = [
        unlimited if limit is None else limit
        for limit in (max_substitutions, max_insertions, max_deletions)
    ]
    if max_l_dist <= min(per_kind_limits):
        def levenshtein_search(subsequence, sequence, search_params):
            return find_near_matches_levenshtein(
                subsequence, sequence, search_params.max_l_dist)
        return levenshtein_search

    # if none of the special cases above are met, use the most generic version
    return find_near_matches_generic
def find_near_matches_generic(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must stay within the limits
    carried by search_params (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Raises ValueError if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist

    # the limitations are so strict that only exact matches are allowed
    if max_l_dist == 0:
        exact_matches = []
        for pos in search_exact(subsequence, sequence):
            exact_matches.append(Match(pos, pos + len(subsequence), 0))
        return exact_matches

    # the n-gram length would be at least 3: use the n-gram search method
    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(subsequence, sequence,
                                                search_params)

    # otherwise use the linear programming search method and keep only the
    # best match of every group
    candidates = find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
    winners = [
        get_best_match_in_group(grp) for grp in group_matches(candidates)
    ]
    return sorted(winners)
Beispiel #9
0
 def search(self, subsequence, sequence, start_index=0, end_index=None):
     """Return everything search_exact() yields for the arguments, as a list."""
     hits = search_exact(subsequence, sequence, start_index, end_index)
     return list(hits)
Beispiel #10
0
def find_near_matches_no_deletions_ngrams(subsequence, sequence,
                                          search_params):
    """Search for near-matches of subsequence in sequence, without deletions.

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * at most max_substitutions character substitutions
    * at most max_insertions inserted characters
    * at most max_l_dist substitutions and insertions in total

    search_params must provide ``unpacked``, the 4-tuple
    (max_substitutions, max_insertions, max_deletions, max_l_dist); the
    max_deletions item is not used by this function.

    Raises ValueError if the subsequence is empty, or if the n-gram length
    (len(subsequence) // (max_substitutions + max_insertions + 1)) would be
    zero.

    Returns a list of Match objects sorted by their start index.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, _, max_l_dist = search_params.unpacked

    # neither kind of edit can be used more often than the total allows
    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    matches = []
    # start positions already covered by a recorded match
    matched_indexes = set()

    for ngram_start in range(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before = subsequence[:ngram_start]
        subseq_before_reversed = subseq_before[::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_insertions)
        end_index = min(seq_len,
                        seq_len - (subseq_len - ngram_end) + max_insertions)

        for index in search_exact(
                subsequence[ngram_start:ngram_end],
                sequence,
                start_index,
                end_index,
        ):
            if index - ngram_start in matched_indexes:
                continue

            seq_after = sequence[index + ngram_len:index + subseq_len -
                                 ngram_start + max_insertions]
            if seq_after.startswith(subseq_after):
                # exact continuation: no substitutions, no insertions
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(subseq_after, seq_after,
                                        max_substitutions, max_insertions,
                                        max_l_dist)
                if not matches_after:
                    continue

            # limits for the left side: the full limits minus the least that
            # any right-side candidate used
            subs_left = max_substitutions - min(m[0] for m in matches_after)
            ins_left = max_insertions - min(m[1] for m in matches_after)
            dist_left = max_l_dist - min(m[0] + m[1] for m in matches_after)

            # clamp at 0: a negative slice start would wrap around and select
            # from the end of the sequence instead of its beginning
            before_start = max(0, index - ngram_start - ins_left)
            seq_before = sequence[before_start:index]
            if seq_before.endswith(subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    subseq_before_reversed,
                    seq_before[::-1],
                    subs_left,
                    ins_left,
                    dist_left,
                )

            # combine every left candidate with every right candidate and keep
            # the combinations that satisfy all three limits
            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    if (subs_before + subs_after <= max_substitutions
                            and ins_before + ins_after <= max_insertions
                            and subs_before + subs_after + ins_before +
                            ins_after <= max_l_dist):
                        matches.append(
                            Match(
                                start=index - ngram_start - ins_before,
                                end=index - ngram_start + subseq_len +
                                ins_after,
                                dist=subs_before + subs_after + ins_before +
                                ins_after,
                            ))
                        matched_indexes |= set(
                            range(
                                index - ngram_start - ins_before,
                                index - ngram_start - ins_before +
                                max_insertions + 1,
                            ))

    return sorted(matches, key=lambda match: match.start)