Esempio n. 1
0
 def test_alignment(self):
     """Check align_glocal against a table of hand-picked string pairs.

     Every row holds the two strings handed to align_glocal and the tuple
     the call is expected to return.
     """
     # NOTE(review): the result tuple is presumably (edit distance, start
     # offset inside the second string) -- confirm against align_glocal.
     expected_by_inputs = (
         (("test", "test"), (0, 0)),
         (("test", " test"), (0, 1)),
         ((" test", "test"), (1, 0)),
         (("tast", "test"), (1, 0)),
         (("teste", "tast"), (2, 0)),
         (("test", " tast"), (1, 1)),
     )
     for (first, second), outcome in expected_by_inputs:
         self.assertEqual(align_glocal(first, second), outcome)
Esempio n. 2
0
def find_ranges(needle, genome, max_edit):
    """
    Finds all matches in the FM-index genome for the needle within max_edit edit distance

    Candidate regions are seeded from exact matches of every length-k
    substring of the needle, then each sufficiently supported region is
    verified with align_glocal.

    Args:
        needle: sequence to look for.
        genome: index object; this function calls ``genome.search(s)`` and
            iterates the result for hit positions, and slices ``genome.seq``.
        max_edit: largest edit distance accepted for a match.

    Returns:
        A tuple ``(alignments, best_edit)``. ``alignments`` is a list of
        ``(position, edit_distance)`` pairs that all share the smallest edit
        distance found, and ``best_edit`` is that distance. When no candidate
        verifies within ``max_edit`` the result is ``([], max_edit)``.
    """
    #Skip all the hard work if the edit distance is zero.
    if not max_edit:
        return [(h, 0) for h in genome.search(needle)], 0

    #Calculate the biggest substring that must exist in the haystack if the needle matches
    # with the edit distance. This assumes worst case distribution of deletion-edits.
    # Floor division keeps k an integer on both Python 2 and Python 3.
    # NOTE(review): k drops to 0 or below when len(needle) <= 2 * max_edit, at
    # which point the kmer seeding is meaningless -- confirm callers avoid that.
    k = (len(needle) - max_edit) // (max_edit + 1)

    #Create a sorted set of intervals, each stored as [start, end, kmer_count]
    ranges = SortedSet(updator=OverlappingIntervalsUpdator)

    #Iterate over every kmer offset in the needle and every exact hit of that kmer
    for offset in range(len(needle) - k + 1):
        for hit in genome.search(needle[offset:offset + k]):
            #Check for any existing possible ranges that are already in our list
            overlaps = ranges.overlap_point(hit)
            if len(overlaps):
                #Increment the number of kmers in each possible range
                for overlap in overlaps:
                    overlap[2] += 1
            else:
                #Create a new range consisting of the worst-case given the position of the kmer in the needle.
                #The start is clamped at 0: a negative start would later be read
                #as a from-the-end slice index and select the wrong window.
                window_start = max(0, hit - offset - max_edit)
                window_end = hit - offset + len(needle) + max_edit
                ranges.add([window_start, window_end, 1])

    best_edit = max_edit

    #Iterate over all potential alignments and use dynamic programming to determine whether it actually
    #fits within edit distance
    #Possible matches defined as the ranges with > max_edit distance kmers found in the match
    alignments = []
    for candidate in ranges:
        if candidate[2] <= max_edit:
            continue
        ed, start = align_glocal(needle, genome.seq[candidate[0]:candidate[1]])
        if ed > best_edit:
            #Worse than the best seen so far (or beyond max_edit): not a match to report
            continue
        if ed < best_edit:
            #A strictly better alignment invalidates everything collected so far
            best_edit = ed
            alignments = []
        alignments.append((candidate[0] + start, ed))

    return alignments, best_edit