Beispiel #1
0
def suffix_search_lcs(a, b):
    """Return the length of the longest substring shared by ``a`` and ``b``.

    A ``SuffixTree`` is built over ``a``; every window of a candidate length
    in ``b`` is then looked up in that tree.  The candidate length is first
    grown by doubling and then narrowed with a right-most binary search.

    NOTE(review): a lookup counts as a hit when ``tree.findStringIdx``
    returns a truthy value -- confirm what it returns for a match at index 0
    and for the empty string, since the binary search may probe length 0.
    """
    # Exclusive upper bounds used by the range/loop conditions below.
    bound_a = len(a) + 1
    bound_b = len(b) + 1
    shortest = min(len(a), len(b))

    tree = SuffixTree(False, [a])

    def found_common(size):
        """Tell whether some window of ``size`` items of ``b`` is in the tree."""
        for end in range(size, bound_b):
            if tree.findStringIdx(b[end - size:end]):
                return True
        return False

    # Doubling phase: every length that succeeds pushes the lower bound
    # just past it and doubles the upper bound.
    lo, hi = 0, 1
    while hi < bound_a and found_common(hi):
        lo = hi + 1
        hi *= 2
    # A common substring can never be longer than the shorter input.
    hi = min(hi, shortest)

    # Right-most binary search: ``hi`` finishes on the largest length for
    # which ``found_common`` succeeded.
    while lo <= hi:
        mid = (lo + hi) // 2
        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    return hi
Beispiel #2
0
    def transform(self, s):
        """Burrows-Wheeler transform of ``s`` computed with a SuffixTree.

        ``self.EOS`` is appended to ``s`` as an end-of-text marker, the marked
        string is added to a fresh ``SuffixTree`` and ``self._walk`` is run on
        the tree's root.  One output character is produced per value returned
        by ``self._walk``.

        Args:
            s: Input string; must not already contain ``self.EOS``.

        Returns:
            The transformed string (same length as ``s`` plus the marker,
            assuming ``_walk`` returns one entry per suffix -- confirm).

        Raises:
            AssertionError: If ``s`` already contains ``self.EOS``.
        """
        assert self.EOS not in s, "Input string cannot contain null character ('%s')" % self.EOS

        # add end of text marker
        s += self.EOS

        # Original author's note: construction here is O(n * log n) and could
        # be done in O(n) -- not verifiable from this method alone.
        st = SuffixTree()
        st.add(s)

        # Per the original comments, _walk visits the tree in order and
        # yields only the length of each (sorted) suffix -- confirm in _walk.
        lens = self._walk(st.root)
        total = len(lens)

        # The last-column letter sits immediately left of each suffix, i.e.
        # ``length + 1`` characters from the end of ``s``.  The suffix whose
        # length equals the number of entries has nothing to its left, so it
        # maps to the end-of-text marker instead (and avoids an IndexError).
        return ''.join(
            self.EOS if length == total else s[-length - 1]
            for length in lens
        )
Beispiel #3
0
 def __init__(self, dna):
     """Keep ``dna``, create its suffix tree and empty index containers.

     The tree is constructed from the length of ``dna``; the four list
     attributes start out empty and ``init_self()`` is invoked last
     (presumably to populate them -- confirm in ``init_self``).
     """
     dna_length = len(dna)
     self.dna = dna
     self.suffix_tree = SuffixTree(dna_length)
     # Each name gets its own fresh list object.
     self.suffix_array, self.first_col = [], []
     self.bwt, self.ltof = [], []
     self.init_self()
Beispiel #4
0
def suffix_search_lcs(a, b):
    """Print progress and the longest common substrings; return their length.

    A ``SuffixTree`` is built over ``a``; every window of a candidate length
    in ``b`` is looked up in that tree.  The candidate length is grown by
    doubling and then narrowed with a right-most binary search, with the
    current bounds / midpoint printed at each step.  Finally every distinct
    window of the resulting length that is found in the tree is printed.

    NOTE(review): a lookup counts as a hit when ``tree.findStringIdx``
    returns a truthy value -- confirm what it returns for a match at index 0
    and for the empty string, since the binary search may probe length 0.
    """
    # Exclusive upper bounds used by the range/loop conditions below.
    bound_a = len(a) + 1
    bound_b = len(b) + 1
    shortest = min(len(a), len(b))

    tree = SuffixTree(True, [a])
    print('Completed suffix tree')

    def found_common(size):
        """Tell whether some window of ``size`` items of ``b`` is in the tree."""
        for end in range(size, bound_b):
            if tree.findStringIdx(b[end - size:end]):
                return True
        return False

    # Doubling phase: every length that succeeds pushes the lower bound
    # just past it and doubles the upper bound.
    lo, hi = 0, 1
    print(lo, hi)
    while hi < bound_a and found_common(hi):
        lo = hi + 1
        hi *= 2
        print(lo, hi)
    # A common substring can never be longer than the shorter input.
    hi = min(hi, shortest)
    print(lo, hi)

    # Right-most binary search: ``hi`` finishes on the largest length for
    # which ``found_common`` succeeded.
    while lo <= hi:
        mid = (lo + hi) // 2
        print(mid)
        if found_common(mid):
            lo = mid + 1
        else:
            hi = mid - 1

    print('Longest Common Substrings:')
    # Collect each distinct window of the final length that the tree knows.
    matches = set()
    for end in range(hi, bound_b):
        candidate = b[end - hi:end]
        if tree.findStringIdx(candidate):
            matches.add(candidate)
    print('\n'.join(matches))

    return hi
Beispiel #5
0
@dataclass
class Point:
    """A latitude/longitude pair usable as a hashable sequence element.

    Equality and hashing are both derived from the ``(lat, lon)`` tuple so
    that equal points always hash alike.  Instances stay mutable; do not
    change ``lat``/``lon`` while a point is stored in a set or used as a
    dict key.
    """

    lat: float
    lon: float

    def __hash__(self) -> int:
        """Hash of the ``(lat, lon)`` tuple."""
        return hash((self.lat, self.lon))

    def __eq__(self, o: object) -> bool:
        """Compare coordinates field by field.

        Comparing hashes is not a valid equality test: distinct values can
        share a hash (in CPython ``hash(-1) == hash(-2)``), and unhashable
        operands have no callable ``__hash__`` at all.  Non-``Point``
        operands yield ``NotImplemented`` so Python falls back to the other
        operand and ultimately to "not equal".
        """
        if not isinstance(o, Point):
            return NotImplemented
        return (self.lat, self.lon) == (o.lat, o.lon)

    def __str__(self) -> str:
        """Render as ``"lat,lon"``."""
        return f'{self.lat},{self.lon}'


# Build a suffix tree over a sequence of Point objects instead of characters.
# The sequence repeats the pair Point(1,1), Point(1,0), so it contains a
# repeated sub-sequence.
# NOTE(review): presumably SuffixTree relies on the elements' __hash__/__eq__
# to match symbols -- confirm against the SuffixTree implementation.
s = SuffixTree()
s.generate(
    (
        Point(1,1),
        Point(1,0),
        Point(0,1),
        Point(1,1),
        Point(1,0),
        Point(0,0),
    )
)
# Alternative plain-string input kept by the original author:
# s.generate('MISSISSIPPI$')

# annotate graph for visualization
# The loop starts at 1, presumably because node 0 is the root and has no
# parent -- confirm against SuffixTree.order()/parent_id().
for i in range(1, s.order()):
    parent = s.parent_id(i)
Beispiel #6
0
'''
Created on Oct 23, 2018

@author: ckennington
'''
from stlm import STLM
from suffixtree import SuffixTree
from sequence import Sequence

# Demo script: feed a short token sequence into a SuffixTree one token at a
# time, print the tree, then wrap it in an STLM.
trie = SuffixTree()

# Whitespace-split tokens: ['c', 'a', 'c', 'a', 'o'].
text = 'c a c a o'.split()

for w in text:
    print('adding', w)
    trie.add(w)

print('\n')
trie.print_tree()
print('\n')

# NOTE(review): presumably this must run after all tokens are added and
# before the tree is handed to STLM -- confirm in SuffixTree/STLM.
trie.update_all_counts()

stlm = STLM(trie)

# Token sequences to query; each entry is a list of single-token strings.
tests = [
    'c a'.split(), 'c a o'.split(), 'a o'.split(), 'o'.split(), 'c'.split()
]

for test in tests:
    seq = Sequence()