def expectedOutcomes(self, search_results, expected_outcomes):
    """Assert that results and expectations agree once consolidated.

    Both lists are split with ``group_matches`` and reduced to one
    ``get_best_match_in_group`` pick per group; the two reduced lists are
    then compared with ``assertEqual``.
    """
    def best_per_group(matches):
        # One representative match for every group of ``matches``.
        return [
            get_best_match_in_group(grp) for grp in group_matches(matches)
        ]

    actual = best_per_group(search_results)
    wanted = best_per_group(expected_outcomes)
    return self.assertEqual(actual, wanted)
 def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
     """Compare the best-of-group matches of results and expectations.

     Each input list is grouped with ``group_matches`` and every group is
     replaced by its ``get_best_match_in_group`` pick.  Extra positional and
     keyword arguments are forwarded unchanged to ``assertEqual``.
     """
     consolidated = []
     for matches in (search_results, expected_outcomes):
         picks = []
         for grp in group_matches(matches):
             picks.append(get_best_match_in_group(grp))
         consolidated.append(picks)

     actual, wanted = consolidated
     return self.assertEqual(actual, wanted, *args, **kwargs)
Example #3
0
def trim_primer(
    seq_record: SeqIO.SeqRecord,
    primer_seqs: List[str],
    max_mismatch: Union[float, int] = 0.14,
) -> SeqIO.SeqRecord:
    """
    Trim primer sequences.

    The forward primer and ``revc()`` of the reverse primer are searched for
    in the record with ``find_near_matches``. Everything up to the end of the
    best forward hit and everything from the start of the best reverse hit
    is cut away. A primer that is not found leaves its side untrimmed.

    Parameters
    ----------
    seq_record : Bio.Seq.SeqRecord
        input sequence record
    primer_seqs : list
        list of the forward and reverse primer sequences (exactly two items)
    max_mismatch : float or int
        Maximum number (>= 1) or proportion (between 0 and 1, exclusive) of
        mismatches allowed when searching for the primer sequences
        (default: 0.14). A proportion is scaled by each primer's length.

    Returns
    -------
    Bio.Seq.SeqRecord
        Slice of ``seq_record`` between the two primer hits.

    Raises
    ------
    ValueError
        If ``max_mismatch`` is not a positive value.

    """
    fwd, rev = primer_seqs
    rev_rc = revc(rev)

    if max_mismatch >= 1:
        # Absolute count. ``>=`` (not ``>``) so that a value of exactly 1
        # means "one mismatch" rather than "100% of the primer length", and
        # int() so a float count is passed on as a whole number.
        max_l_dist1 = max_l_dist2 = int(max_mismatch)
    elif max_mismatch > 0:
        # Proportion of each primer's own length.
        max_l_dist1 = round(len(fwd) * max_mismatch)
        max_l_dist2 = round(len(rev) * max_mismatch)
    else:
        raise ValueError("max_mismatch must be a positive value")

    seq_str = str(seq_record.seq)
    m0 = find_near_matches(fwd, seq_str, max_l_dist=max_l_dist1)
    m1 = find_near_matches(rev_rc, seq_str, max_l_dist=max_l_dist2)

    # ``None`` as a slice bound leaves that side of the record untouched.
    start = get_best_match_in_group(m0).end if m0 else None
    stop = get_best_match_in_group(m1).start if m1 else None

    return seq_record[start:stop]
Example #4
0
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Search ``sequence`` for near-matches of ``subsequence``.

    Dispatches to one of three strategies depending on the arguments:

    * ``max_l_dist == 0``: exact search via ``search_exact``;
    * ``len(subsequence) // (max_l_dist + 1) >= 3``: the n-gram based search;
    * otherwise: the linear-programming search, whose matches are grouped
      and reduced to the best match of each group, then sorted.

    Returns a list of Match objects.

    Raises ValueError if ``subsequence`` is empty or ``max_l_dist`` is
    negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    pattern_len = len(subsequence)

    # Only exact matches are acceptable.
    if max_l_dist == 0:
        exact = []
        for start in search_exact(subsequence, sequence):
            exact.append(Match(start, start + pattern_len, 0))
        return exact

    # N-grams of length 3 or more are available.
    if pattern_len // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist)

    candidates = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    winners = [
        get_best_match_in_group(grp) for grp in group_matches(candidates)
    ]
    winners.sort()
    return winners
 def expectedOutcomes(self, search_result, expected_outcomes, *args,
                      **kwargs):
     """Assert that ``search_result`` is already reduced to group bests.

     The result list is grouped with ``group_matches`` and each group is
     replaced by its ``get_best_match_in_group`` pick; the outcome must equal
     ``search_result`` itself.  ``expected_outcomes`` is accepted but not
     consulted.  Extra arguments are forwarded to ``assertEqual``.
     """
     groups = group_matches(search_result)
     already_best = list(map(get_best_match_in_group, groups))
     self.assertEqual(search_result, already_best, *args, **kwargs)
 def expectedOutcomes(self, search_result, expected_outcomes,
                      *args, **kwargs):
     """Check that the search result contains only best-of-group matches.

     ``search_result`` is compared against the list built by taking
     ``get_best_match_in_group`` of every group that ``group_matches``
     yields for it.  ``expected_outcomes`` is not used by this check;
     remaining arguments go straight to ``assertEqual``.
     """
     reduced = []
     for grp in group_matches(search_result):
         reduced.append(get_best_match_in_group(grp))
     self.assertEqual(search_result, reduced, *args, **kwargs)
 def search(self, subsequence, sequence, max_subs):
     """Run ``c_fnm_generic_ngrams`` and keep the best match of each group.

     The search parameters are ``LevenshteinSearchParams(max_subs, 0, 0,
     max_subs)``; the raw matches are grouped with ``group_matches`` and each
     group is reduced with ``get_best_match_in_group``.
     """
     params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
     raw_matches = c_fnm_generic_ngrams(subsequence, sequence, params)
     return list(map(get_best_match_in_group, group_matches(raw_matches)))
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        """Substitution-only n-gram search with a bytes-like fast path.

        When neither argument is a ``text_type`` instance, the search is
        first attempted with ``_subs_only_fnm_ngram_byteslike``.  The indexes
        it returns are turned into ``Match`` objects, whose distance comes
        from ``count_differences_with_maximum``, and the best match of every
        group is returned.  If that call raises ``TypeError``, or if either
        argument is text, the work is delegated to
        ``py_find_near_matches_substitutions_ngrams``.
        """
        neither_is_text = (
            not isinstance(subsequence, text_type)
            and not isinstance(sequence, text_type)
        )
        if neither_is_text:
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # Fast path rejected the inputs; use the fallback below.
                pass
            else:
                candidates = []
                for start in start_indexes:
                    end = start + len(subsequence)
                    n_diffs = count_differences_with_maximum(
                        sequence[start:end],
                        subsequence,
                        max_substitutions + 1,
                    )
                    candidates.append(Match(start, end, n_diffs))
                return list(
                    map(get_best_match_in_group, group_matches(candidates))
                )

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)
Example #9
0
 def search(self, subsequence, sequence, max_l_dist):
     """Search with ``fnm_generic_lp`` and return one match per group.

     All four ``LevenshteinSearchParams`` arguments are set to
     ``max_l_dist``.  The matches found are grouped with ``group_matches``
     and each group is reduced with ``get_best_match_in_group``.
     """
     params = LevenshteinSearchParams(
         max_l_dist, max_l_dist, max_l_dist, max_l_dist)
     found = fnm_generic_lp(subsequence, sequence, params)

     best = []
     for grp in group_matches(found):
         best.append(get_best_match_in_group(grp))
     return best
 def search(self, subsequence, sequence, max_l_dist):
     """Search with ``c_fnm_generic_lp`` and keep each group's best match.

     ``max_l_dist`` is used for every one of the four
     ``LevenshteinSearchParams`` arguments.  Results are grouped by
     ``group_matches`` and reduced by ``get_best_match_in_group``.
     """
     limits = (max_l_dist,) * 4
     raw_matches = c_fnm_generic_lp(
         subsequence, sequence, LevenshteinSearchParams(*limits))
     return list(map(get_best_match_in_group, group_matches(raw_matches)))
Example #11
0
 def search(self, subsequence, sequence, max_subs):
     """Substitution-only search through ``c_fnm_generic_ngrams``.

     The test is skipped (``self.skipTest``) when ``max_subs`` is not smaller
     than the subsequence length.  Otherwise the search runs with
     ``LevenshteinSearchParams(max_subs, 0, 0, max_subs)`` and the best match
     of every group of results is returned.
     """
     if not max_subs < len(subsequence):
         self.skipTest("avoiding calling c_fnm_generic_ngrams() "
                       "with max_subs >= len(subsequence)")

     params = LevenshteinSearchParams(max_subs, 0, 0, max_subs)
     found = c_fnm_generic_ngrams(subsequence, sequence, params)

     best = []
     for grp in group_matches(found):
         best.append(get_best_match_in_group(grp))
     return best
 def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
     """Assert equality of consolidated results and consolidated expectations.

     Results are consolidated by taking ``get_best_match_in_group`` of every
     group from ``group_matches``; expectations are consolidated with
     ``consolidate_overlapping_matches``.  Extra arguments are forwarded to
     ``assertEqual``.
     """
     actual = list(
         map(get_best_match_in_group, group_matches(search_results))
     )
     wanted = consolidate_overlapping_matches(expected_outcomes)
     return self.assertEqual(actual, wanted, *args, **kwargs)
 def search(self, subsequence, sequence, max_subs):
     """Run ``c_fnm_generic_ngrams`` allowing substitutions only.

     Calls ``self.skipTest`` when ``max_subs >= len(subsequence)``.  The
     search uses ``LevenshteinSearchParams(max_subs, 0, 0, max_subs)``; its
     matches are grouped and each group is reduced to its best match.
     """
     too_permissive = max_subs >= len(subsequence)
     if too_permissive:
         self.skipTest("avoiding calling c_fnm_generic_ngrams() "
                       "with max_subs >= len(subsequence)")

     raw_matches = c_fnm_generic_ngrams(
         subsequence,
         sequence,
         LevenshteinSearchParams(max_subs, 0, 0, max_subs),
     )
     return list(map(get_best_match_in_group, group_matches(raw_matches)))
 def search(self, pattern, sequence, max_subs, max_ins, max_dels,
            max_l_dist=None):
     """Search with ``c_fnm_generic_ngrams`` under the given limits.

     The four limits are packed, in the order received, into a
     ``LevenshteinSearchParams``.  The matches are grouped with
     ``group_matches`` and the best match of each group is returned.
     """
     params = LevenshteinSearchParams(max_subs, max_ins, max_dels, max_l_dist)
     found = c_fnm_generic_ngrams(pattern, sequence, params)

     best = []
     for grp in group_matches(found):
         best.append(get_best_match_in_group(grp))
     return best
Example #15
0
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Find near-matches by anchoring on exact n-gram hits.

    The subsequence is cut into consecutive n-grams of length
    ``len(subsequence) // (max_l_dist + 1)``.  Every exact occurrence of an
    n-gram in ``sequence`` (found with ``search_exact``) is extended with
    ``_expand``, first to the right and then to the left, the left side
    getting whatever distance budget the right side left over.  Successful
    extensions become ``Match`` objects; overlapping ones are grouped and only
    the best of each group is returned, sorted.

    Raises ValueError if the computed n-gram length is zero.
    """
    pattern_len = len(subsequence)
    text_len = len(sequence)

    ngram_len = pattern_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    candidates = []
    for ngram_start in xrange(0, pattern_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        ngram = subsequence[ngram_start:ngram_end]
        # The part before the n-gram is matched leftwards, hence reversed.
        prefix_reversed = subsequence[:ngram_start][::-1]
        suffix = subsequence[ngram_end:]

        # Window of the sequence in which this n-gram is searched for.
        search_from = max(0, ngram_start - max_l_dist)
        search_until = min(
            text_len, text_len - pattern_len + ngram_end + max_l_dist)

        for hit in search_exact(ngram, sequence, search_from, search_until):
            # Extend to the right of the n-gram hit.
            right_stop = hit - ngram_start + pattern_len + max_l_dist
            right_window = sequence[hit + ngram_len:right_stop]
            dist_right, right_size = _expand(
                suffix, right_window, max_l_dist)
            if dist_right is None:
                continue

            # Extend to the left with the remaining distance budget.
            left_budget = max_l_dist - dist_right
            left_start = max(0, hit - ngram_start - left_budget)
            left_window = sequence[left_start:hit][::-1]
            dist_left, left_size = _expand(
                prefix_reversed, left_window, left_budget)
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            candidates.append(
                Match(
                    start=hit - left_size,
                    end=hit + ngram_len + right_size,
                    dist=dist_left + dist_right,
                ))

    # Overlapping matches are not all returned: they are grouped and only
    # the best match of each group is kept.
    winners = [
        get_best_match_in_group(grp) for grp in group_matches(candidates)
    ]
    winners.sort()
    return winners
 def search(self, subsequence, sequence, max_subs):
     """Search with ``fnm_subs_ngrams_byteslike`` and build ``Match`` objects.

     Each index returned by the search becomes a ``Match`` spanning
     ``len(subsequence)`` items, with its distance computed by
     ``count_differences_with_maximum`` (capped at ``max_subs + 1``) and the
     corresponding slice of ``sequence`` stored as ``matched``.  The best
     match of every group is returned.
     """
     start_indexes = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                               max_subs)

     candidates = []
     for start in start_indexes:
         end = start + len(subsequence)
         n_diffs = count_differences_with_maximum(
             sequence[start:end],
             subsequence,
             max_subs + 1,
         )
         candidates.append(
             Match(start, end, n_diffs, matched=sequence[start:end]))

     return list(map(get_best_match_in_group, group_matches(candidates)))
 def search(self,
            pattern,
            sequence,
            max_subs,
            max_ins,
            max_dels,
            max_l_dist=None):
     """Run ``c_fnm_generic_ngrams`` and return the best match per group.

     ``max_subs``, ``max_ins``, ``max_dels`` and ``max_l_dist`` are passed,
     in that order, to ``LevenshteinSearchParams``.  The resulting matches
     are grouped by ``group_matches`` and reduced with
     ``get_best_match_in_group``.
     """
     limits = (max_subs, max_ins, max_dels, max_l_dist)
     raw_matches = c_fnm_generic_ngrams(
         pattern, sequence, LevenshteinSearchParams(*limits))
     return list(map(get_best_match_in_group, group_matches(raw_matches)))
 def search(self, subsequence, sequence, max_subs):
     """Search with ``fnm_subs_ngrams_byteslike``; return group-best matches.

     Every returned index is wrapped in a ``Match`` that covers
     ``len(subsequence)`` items and whose distance is the value of
     ``count_differences_with_maximum`` for that window (capped at
     ``max_subs + 1``).  The matches are then grouped and each group is
     reduced to its best match.
     """
     start_indexes = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                               max_subs)

     candidates = []
     for start in start_indexes:
         end = start + len(subsequence)
         n_diffs = count_differences_with_maximum(
             sequence[start:end],
             subsequence,
             max_subs + 1,
         )
         candidates.append(Match(start, end, n_diffs))

     best = []
     for grp in group_matches(candidates):
         best.append(get_best_match_in_group(grp))
     return best
Example #19
0
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must respect the limits carried
    by ``search_params`` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Overlapping matches are grouped and only the best match of each group is
    returned, in sorted order.

    Raises ValueError if ``subsequence`` is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    candidates = list(
        _find_near_matches_generic_ngrams(subsequence, sequence,
                                          search_params))

    # Collapse every group of overlapping matches to its best member.
    winners = []
    for grp in group_matches(candidates):
        winners.append(get_best_match_in_group(grp))
    winners.sort()
    return winners
Example #20
0
def find_near_matches_generic(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must respect the limits carried
    by ``search_params`` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    One of three strategies is chosen from ``search_params.max_l_dist``:
    exact search when it is 0, the n-gram search when
    ``len(subsequence) // (max_l_dist + 1) >= 3``, and the linear-programming
    search otherwise (whose matches are grouped, reduced to the best of each
    group and sorted).

    Raises ValueError if ``subsequence`` is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist
    pattern_len = len(subsequence)

    # Limits so strict that only exact matches qualify: use search_exact().
    if max_l_dist == 0:
        exact = []
        for start in search_exact(subsequence, sequence):
            exact.append(Match(start, start + pattern_len, 0))
        return exact

    # The n-gram length would be at least 3: use the n-gram search method.
    if pattern_len // (max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(subsequence, sequence,
                                                search_params)

    # Otherwise fall back to the linear programming search method.
    candidates = find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
    winners = [
        get_best_match_in_group(grp) for grp in group_matches(candidates)
    ]
    winners.sort()
    return winners
Example #21
0
 def search(self, subsequence, sequence, max_l_dist):
     """Search with ``fnm_levenshtein_lp`` and keep each group's best match.

     The matches produced by the search are grouped with ``group_matches``
     and every group is reduced with ``get_best_match_in_group``.
     """
     found = fnm_levenshtein_lp(subsequence, sequence, max_l_dist)

     best = []
     for grp in group_matches(found):
         best.append(get_best_match_in_group(grp))
     return best