def expectedOutcomes(self, search_results, expected_outcomes):
     best_from_grouped_results = [
         get_best_match_in_group(group)
         for group in group_matches(search_results)
     ]
     best_from_grouped_expected_outcomes = [
         get_best_match_in_group(group)
         for group in group_matches(expected_outcomes)
     ]
     return self.assertEqual(best_from_grouped_results,
                             best_from_grouped_expected_outcomes)
 def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
     best_from_grouped_results = [
         get_best_match_in_group(group)
         for group in group_matches(search_results)
     ]
     best_from_grouped_expected_outcomes = [
         get_best_match_in_group(group)
         for group in group_matches(expected_outcomes)
     ]
     return self.assertEqual(best_from_grouped_results,
                             best_from_grouped_expected_outcomes,
                             *args, **kwargs)
 def expectedOutcomes(self, search_result, expected_outcomes,
                      *args, **kwargs):
     best_from_groups = [
         get_best_match_in_group(group)
         for group in group_matches(search_result)
     ]
     self.assertEqual(search_result, best_from_groups, *args, **kwargs)
 def search(self, subsequence, sequence, max_subs):
     return [
         get_best_match_in_group(group) for group in group_matches(
             c_fnm_generic_ngrams(
                 subsequence, sequence,
                 LevenshteinSearchParams(max_subs, 0, 0, max_subs)))
     ]
Example #6
def find_variable_ssrs(align, min_variants=3, **kwargs):
    """
    Find variable SSR regions from a multiple sequence alignment

    Parameters
    ----------
    align: Bio.Align.MultipleSeqAlignment
        input alignment
    min_variants: int
        Minimum number of variants for variable SSR regions
    **kwargs
        The keyword arguments are passed to find_ssrs()

    See Also
    --------
    find_ssrs()
    """
    matches = [find_ssrs(str(a.seq), max_interrupt=0, **kwargs) for a in align]
    match_groups = group_matches(chain(*matches))
    ssr_regions = []
    motifs = []
    for group in match_groups:
        if len(group) >= min_variants:
            starts, ends = list(zip(*[[rep.start, rep.end] for rep in group]))
            ssr_regions.append((np.min(starts), np.max(ends)))
            motifs.append(get_longest_RepData(group).motif)

    return ssr_regions, motifs
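
A hypothetical usage sketch for the function above; find_variable_ssrs() and find_ssrs() are assumed to come from the same SSR module as this snippet, and the alignment objects come from Biopython:

# Hypothetical usage sketch (not from the original source): build a small
# alignment with Biopython and look for SSR regions whose repeat count varies.
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

align = MultipleSeqAlignment([
    SeqRecord(Seq("AACAGAGAGAGAGTTC"), id="sample1"),  # (AG) x 5
    SeqRecord(Seq("AACAGAGAGAG--TTC"), id="sample2"),  # (AG) x 4, gap-padded
    SeqRecord(Seq("AACAGAG------TTC"), id="sample3"),  # (AG) x 2, gap-padded
])
# min_repeats is forwarded to find_ssrs() through **kwargs
ssr_regions, motifs = find_variable_ssrs(align, min_variants=3, min_repeats=2)
print(ssr_regions, motifs)
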
def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                           max_substitutions):
    # use the C byteslike implementation when neither input is a text string
    if not (
        isinstance(subsequence, text_type) or
        isinstance(sequence, text_type)
    ):
        try:
            results = _subs_only_fnm_ngram_byteslike(
                subsequence, sequence, max_substitutions)
        except TypeError:
            pass
        else:
            matches = [
                Match(
                    index,
                    index + len(subsequence),
                    count_differences_with_maximum(
                        sequence[index:index+len(subsequence)],
                        subsequence,
                        max_substitutions + 1,
                    ),
                )
                for index in results
            ]
            return [
                get_best_match_in_group(group)
                for group in group_matches(matches)
            ]

    # fall back to the pure-Python implementation
    return py_find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
Example #8
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Find near-matches of the subsequence in the sequence.

    This chooses a suitable fuzzy search implementation according to the given
    parameters.

    Returns a list of fuzzysearch.Match objects describing the matching parts
    of the sequence.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    if max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
        ]

    elif len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(subsequence, sequence,
                                                    max_l_dist)

    else:
        matches = find_near_matches_levenshtein_linear_programming(
            subsequence, sequence, max_l_dist)
        match_groups = group_matches(matches)
        best_matches = [
            get_best_match_in_group(group) for group in match_groups
        ]
        return sorted(best_matches)
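
For comparison, here is a minimal usage sketch of the public fuzzysearch entry point, which covers the same "only a total Levenshtein distance" case (assuming a standard fuzzysearch install):

# Minimal sketch: the returned Match objects expose start, end and dist,
# just like the consolidated results above.
from fuzzysearch import find_near_matches

for m in find_near_matches('PATTERN', '---PATERN---PATTERN--', max_l_dist=1):
    print(m.start, m.end, m.dist)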
Example #9
 def search(self, subsequence, sequence, max_l_dist):
     return [
         get_best_match_in_group(group) for group in group_matches(
             fnm_generic_lp(
                 subsequence, sequence,
                 LevenshteinSearchParams(max_l_dist, max_l_dist, max_l_dist,
                                         max_l_dist)))
     ]
 def search(self, subsequence, sequence, max_l_dist):
     return [
         get_best_match_in_group(group)
         for group in group_matches(
             c_fnm_generic_lp(subsequence,
                              sequence,
                              LevenshteinSearchParams(max_l_dist, max_l_dist, max_l_dist, max_l_dist))
         )
     ]
Example #11
 def test_separate(self):
     matches = [
         Match(start=19, end=29, dist=1, matched='x'*10),
         Match(start=42, end=52, dist=1, matched='x'*10),
         Match(start=99, end=109, dist=0, matched='x'*10),
     ]
     self.assertEqual(
         group_matches(matches),
         [{m} for m in matches],
     )
Example #12
 def test_separate_with_duplicate(self):
     matches = [
         Match(start=19, end=29, dist=1),
         Match(start=42, end=52, dist=1),
         Match(start=99, end=109, dist=0),
     ]
     self.assertEqual(
         group_matches(matches + [matches[1]]),
         [set([m]) for m in matches],
     )
 def expectedOutcomes(self, search_results, expected_outcomes, *args, **kwargs):
     consolidated_results = [
         get_best_match_in_group(group)
         for group in group_matches(search_results)
     ]
     consolidated_expected_outcomes = \
         consolidate_overlapping_matches(expected_outcomes)
     return self.assertEqual(consolidated_results,
                             consolidated_expected_outcomes,
                             *args, **kwargs)
Example #14
 def search(self, subsequence, sequence, max_subs):
     if max_subs >= len(subsequence):
         self.skipTest("avoiding calling c_fnm_generic_ngrams() " +
                       "with max_subs >= len(subsequence)")
     return [
         get_best_match_in_group(group) for group in group_matches(
             c_fnm_generic_ngrams(
                 subsequence, sequence,
                 LevenshteinSearchParams(max_subs, 0, 0, max_subs)))
     ]
Example #15
 def search(self, subsequence, sequence, max_subs):
     if max_subs >= len(subsequence):
         self.skipTest("avoiding calling c_fnm_generic_ngrams() " +
                       "with max_subs >= len(subsequence)")
     return [
         get_best_match_in_group(group)
         for group in group_matches(
             c_fnm_generic_ngrams(subsequence,
                                  sequence,
                                  LevenshteinSearchParams(max_subs, 0, 0, max_subs))
         )
     ]
 def search(self, pattern, sequence, max_subs, max_ins, max_dels,
            max_l_dist=None):
     return [
         get_best_match_in_group(group)
         for group in group_matches(
             c_fnm_generic_ngrams(pattern,
                                  sequence,
                                  LevenshteinSearchParams(
                                      max_subs, max_ins,
                                      max_dels, max_l_dist,
                                  ))
         )
     ]
Example #18
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    matches = []
    for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        ngram_end = ngram_start + ngram_len
        subseq_before_reversed = subsequence[:ngram_start][::-1]
        subseq_after = subsequence[ngram_end:]
        start_index = max(0, ngram_start - max_l_dist)
        end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
        for index in search_exact(subsequence[ngram_start:ngram_end], sequence,
                                  start_index, end_index):
            # try to expand left and/or right according to n_ngram
            dist_right, right_expand_size = _expand(
                subseq_after,
                sequence[index + ngram_len:index - ngram_start + subseq_len +
                         max_l_dist],
                max_l_dist,
            )
            if dist_right is None:
                continue
            dist_left, left_expand_size = _expand(
                subseq_before_reversed,
                sequence[max(0, index - ngram_start -
                             (max_l_dist - dist_right)):index][::-1],
                max_l_dist - dist_right,
            )
            if dist_left is None:
                continue
            assert dist_left + dist_right <= max_l_dist

            matches.append(
                Match(
                    start=index - left_expand_size,
                    end=index + ngram_len + right_expand_size,
                    dist=dist_left + dist_right,
                ))

    # don't return overlapping matches; instead, group overlapping matches
    # together and return the best match from each group
    match_groups = group_matches(matches)
    best_matches = [get_best_match_in_group(group) for group in match_groups]
    return sorted(best_matches)
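
For illustration, a small self-contained sketch of the grouping-and-consolidation step used in the last three lines; the fuzzysearch.common import path and the three-field Match form are assumptions based on the other snippets on this page:

# Consolidation sketch: overlapping matches end up in one group, and
# get_best_match_in_group() keeps one representative per group (the
# lowest-distance match here).
from fuzzysearch.common import Match, group_matches, get_best_match_in_group

matches = [
    Match(start=3, end=10, dist=1),   # overlaps the next match
    Match(start=4, end=11, dist=0),
    Match(start=40, end=47, dist=2),  # far away, so it forms its own group
]
best = [get_best_match_in_group(group) for group in group_matches(matches)]
# expected: the dist=0 match from the first group plus the lone dist=2 match
print(best)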
Example #19
 def search(self, subsequence, sequence, max_subs):
     results = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                         max_subs)
     matches = [
         Match(index,
               index + len(subsequence),
               count_differences_with_maximum(
                   sequence[index:index + len(subsequence)],
                   subsequence,
                   max_subs + 1,
               ),
               matched=sequence[index:index + len(subsequence)])
         for index in results
     ]
     return [
         get_best_match_in_group(group)
         for group in group_matches(matches)
     ]
 def search(self,
            pattern,
            sequence,
            max_subs,
            max_ins,
            max_dels,
            max_l_dist=None):
     return [
         get_best_match_in_group(group) for group in group_matches(
             c_fnm_generic_ngrams(
                 pattern, sequence,
                 LevenshteinSearchParams(
                     max_subs,
                     max_ins,
                     max_dels,
                     max_l_dist,
                 )))
     ]
Example #21
 def search(self, subsequence, sequence, max_subs):
     results = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                         max_subs)
     matches = [
         Match(
             index,
             index + len(subsequence),
             count_differences_with_maximum(
                 sequence[index:index+len(subsequence)],
                 subsequence,
                 max_subs + 1,
             ),
         )
         for index in results
     ]
     return [
         get_best_match_in_group(group)
         for group in group_matches(matches)
     ]
Example #22
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the maximum allowed total number of substitutions, insertions
      and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    matches = list(
        _find_near_matches_generic_ngrams(subsequence, sequence,
                                          search_params))

    # don't return overlapping matches; instead, group overlapping matches
    # together and return the best match from each group
    match_groups = group_matches(matches)
    best_matches = [get_best_match_in_group(group) for group in match_groups]
    return sorted(best_matches)
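
A hedged call sketch for the function above; LevenshteinSearchParams is assumed to live in fuzzysearch.common and to take (max_substitutions, max_insertions, max_deletions, max_l_dist), matching the positional calls elsewhere on this page:

# Allow a single substitution and nothing else; 'PATTERM' differs from
# 'PATTERN' by one substituted character.
from fuzzysearch.common import LevenshteinSearchParams

sequence = '--PATTERM--PATTERN--'
params = LevenshteinSearchParams(1, 0, 0, 1)
print(find_near_matches_generic_ngrams('PATTERN', sequence, params))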
Example #23
def find_near_matches_generic(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the maximum allowed total number of substitutions, insertions
      and deletions
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
    if search_params.max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
        ]

    # if the n-gram length would be at least 3, use the n-gram search method
    elif len(subsequence) // (search_params.max_l_dist + 1) >= 3:
        return find_near_matches_generic_ngrams(subsequence, sequence,
                                                search_params)

    # use the linear programming search method
    else:
        matches = find_near_matches_generic_linear_programming(
            subsequence, sequence, search_params)

        match_groups = group_matches(matches)
        best_matches = [
            get_best_match_in_group(group) for group in match_groups
        ]
        return sorted(best_matches)
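
To make the dispatch concrete, a short branch-selection sketch for the function defined above, using a 7-character subsequence (the LevenshteinSearchParams import path is assumed, as elsewhere on this page):

# Which branch runs depends on max_l_dist relative to the subsequence length.
from fuzzysearch.common import LevenshteinSearchParams

exact_only = LevenshteinSearchParams(0, 0, 0, 0)  # -> search_exact()
ngram_path = LevenshteinSearchParams(1, 1, 1, 1)  # 7 // 2 == 3 -> n-grams
lp_path = LevenshteinSearchParams(2, 2, 2, 2)     # 7 // 3 == 2 -> LP search
for params in (exact_only, ngram_path, lp_path):
    print(find_near_matches_generic('PATTERN', '--PATTERN--PATERN--', params))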
Example #24
def find_ssrs(
    seq: str,
    min_repeats: int = 3,
    motif: Union[str, List[str]] = [],
    motif_class: Union[str, List[str]] = [],
    min_motif_len: int = 2,
    max_motif_len: int = 6,
    max_interrupt: int = 0,
    start: int = 0,
    end: Optional[int] = None,
    **kwargs,
) -> Optional[List[RepData]]:
    """
    Find short sequence repeats in the given sequence string.

    Parameters
    ----------
    seq: str
        input sequence string
    min_repeats: int
        minimum number of repeats to search (default: 3)
    motif: str or list
        repeat motif(s) to search for (default: [])
    motif_class: str or list
        class(es) of repeat motif to search for (default: [])
    min_motif_len: int
        minimum length of repeat motif (default: 2)
    max_motif_len: int
        maximum length of repeat motif (default: 6)
    max_interrupt: int
        maximum length of interruption to allow (default: 0)
    start: int
        starting position of the region to search for SSRs (default: 0)
    end: int
        ending position of the region to search for SSRs (default: None, i.e. the end of the sequence)

    """
    subseq = seq[start:end]
    matches = []
    if motif:
        if isinstance(motif, list):
            motifs = motif
        elif isinstance(motif, str):
            motifs = [motif]
        else:
            raise TypeError("motif must be a str or list")

        motif_classes = [get_motif_class(m) for m in motifs]
        for m, mcls in zip(motifs, motif_classes):
            for s, e, n, rs in _find_ssrs(subseq, m, min_repeats,
                                          max_interrupt):
                matches.append(RepData(s + start, e + start, n, rs, m, mcls))
    else:
        if not motif_class:
            motif_classes = list(
                gen_motif_classes(min_motif_len, max_motif_len))
        elif isinstance(motif_class, list):
            motif_classes = motif_class
        elif isinstance(motif_class, str):
            motif_classes = [motif_class]
        else:
            raise TypeError("motif_class must be a str or list")

        for mcls in motif_classes:
            for m in motif_set(mcls):
                for s, e, n, rs in _find_ssrs(subseq, m, min_repeats,
                                              max_interrupt):
                    matches.append(
                        RepData(s + start, e + start, n, rs, m, mcls))

    match_groups = group_matches(matches)
    best_matches = [get_longest_RepData(g) for g in match_groups if g]

    return sorted(best_matches, key=lambda m: m.start)
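
A hypothetical usage sketch for the function above; the RepData field names follow the attribute accesses in find_variable_ssrs() earlier on this page:

# Look for dinucleotide repeats (at least three repeat units) in a short
# string; the example sequence contains (TA)x4 and (CA)x4 runs.
hits = find_ssrs("TTATATATATGGCACACACAGG", min_repeats=3)
for rep in hits:
    print(rep.start, rep.end, rep.motif)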
Example #25
 def search(self, subsequence, sequence, max_l_dist):
     return [
         get_best_match_in_group(group) for group in group_matches(
             fnm_levenshtein_lp(subsequence, sequence, max_l_dist))
     ]