Exemple #1
0
 def __init__(self, D_plus, D_minus, alphabet):
     """Encode the samples into one word and build its suffix tree.

     Each string of ``D_plus`` is followed by ``'1'`` and each string of
     ``D_minus`` by ``'0'``; the word is closed with ``'$'``. The three
     marker characters are added to ``alphabet``.
     """
     positive_part = '1'.join(D_plus) + '1'
     negative_part = '0'.join(D_minus) + '0$'
     self.word = positive_part + negative_part
     tree = SuffixTree(self.word)
     self.suffixTree = tree
     self.alphabet = alphabet | {'0', '1', '$'}
     self.edges = tree.edges
     # bug fix: a root that has outgoing edges gets suffix_node 0
     if self.__has_edges(0):
         tree.nodes[0].suffix_node = 0
Exemple #2
0
    def init_with_text(self, text):
        """Store ``text`` with a trailing ``"$"`` and compute its suffix array.

        The suffix array comes from a SuffixTree that is fed the stored text
        without the trailing terminator.
        """
        self.__text = text + "$"
        # Start from an empty array so the attribute exists even if the
        # tree construction below raises.
        self.__suffix_array = []

        builder = SuffixTree()
        unterminated = self.__text[:-1]
        builder.init_with_text(unterminated)

        self.__suffix_array = builder.gen_suffix_array()
def constructSuffixArray(main_sequence):
    """Feed ``main_sequence`` to a SuffixTree and return its depth-first search.

    The tree is created with the sequence length and then receives the
    sequence one element at a time through ``add_char``.
    """
    suffix_tree = SuffixTree(len(main_sequence))
    for symbol in main_sequence:
        suffix_tree.add_char(symbol)
    # Debug aid: suffix_tree.print_graphviz_tree()
    return suffix_tree.depthFirstSearch()
def find_shortest_nonshared_substring(seq_1, seq_2):
    """Return the first k-mer of ``seq_1`` that is not a substring of ``seq_2``.

    Lengths are tried in increasing order starting at 2, and for each length
    the k-mers are taken in the order ``generate_kmers`` yields them.
    Returns ``None`` when every candidate occurs in ``seq_2``.
    """
    index = SuffixTree(seq_2)
    for length in range(2, len(seq_1) + 1):
        for candidate in generate_kmers(seq_1, length):
            if not index.has_substring(candidate):
                return candidate
    return None
    def test_init(self):
        """Check the rows and selected nodes of the tree built from 'foo'."""
        st = SuffixTree('foo')
        rows = st.get_rows()

        # Row sizes, one row per depth.
        self.assertEqual(len(rows), 5)  # root plus len(foo) + null term
        self.assertEqual(len(rows[0]), 1)  # The root level
        self.assertEqual(len(rows[1]), 3)  # 'f', 'o', and '\0'
        self.assertEqual(len(rows[2]), 3)  # 'o' (of f), 'o' (of o),
                                           # '\0' (of o)
        self.assertEqual(len(rows[3]), 2)  # 'o' (of o of f), '\0' (of o of o)
        self.assertEqual(len(rows[4]), 1)  # '\0' (of o)

        # The root carries no letter and no parent.
        root_item = rows[0][0]
        self.assertEqual(root_item.let, None)
        self.assertEqual(root_item.parent, None)
        self.assertEqual(root_item.depth, 0)
        self.assertEqual(root_item.positions, set([None]))

        self.assertIn('f', root_item.children)
        self.assertIn('o', root_item.children)
        self.assertIn("\0", root_item.children)

        # First level: the three children of the root.
        f_item = root_item.children['f']
        o_item = root_item.children['o']
        null_item = root_item.children['\0']

        row1 = (f_item, o_item, null_item)

        self.assertEqual([item.let for item in row1], ['f', 'o', '\0'])
        self.assertEqual([item.parent for item in row1], 3 * [rows[0][0], ])
        self.assertEqual([item.depth for item in row1], [1, 1, 1])
        self.assertEqual([item.positions for item in row1],
                         [set([0]), set([1, 2]), set([3])])

        # Second level below 'f'.
        f_child_o = f_item.children['o']
        self.assertEqual(f_child_o.let, 'o')
        self.assertEqual(f_child_o.parent, f_item)
        self.assertEqual(f_child_o.depth, 2)
        self.assertEqual(f_child_o.positions, set([1]))

        # Second level below 'o'.
        o_child_o = o_item.children['o']
        self.assertEqual(o_child_o.let, 'o')
        self.assertEqual(o_child_o.parent, o_item)
        self.assertEqual(o_child_o.depth, 2)
        self.assertEqual(o_child_o.positions, set([2]))

        # FIXME: Comprehensive testing is really called for here. Test every
        # node in the tree.

        # Deepest node: the terminator reached through 'f' -> 'o' -> 'o'.
        lowest_null_parent = f_child_o.children['o']
        lowest_null = lowest_null_parent.children['\0']
        self.assertEqual(lowest_null.let, '\0')
        self.assertEqual(lowest_null.parent, lowest_null_parent)
        self.assertEqual(lowest_null.depth, 4)
        self.assertEqual(lowest_null.positions, set([3]))
Exemple #6
0
def test_insertion(string: str):
    """Insert one string and check the open-ended node count and all suffixes.

    The tree must hold ``len(string) + 2`` nodes whose ``end`` is ``None``
    and must contain every suffix produced by ``suffixes(string)``.
    """
    tree = SuffixTree()
    tree.insert_string(string)
    open_ended = [node for node in tree.nodes if node.end is None]
    assert len(open_ended) == len(string) + 2
    for suffix in suffixes(string):
        assert suffix in tree
 def __init__(self, D_plus, D_minus, alphabet):
     """Encode the samples into one word and build its suffix tree.

     Each string of ``D_plus`` is followed by ``'1'`` and each string of
     ``D_minus`` by ``'0'``; the word is closed with ``'$'``. The three
     marker characters are added to ``alphabet``.
     """
     positive_part = '1'.join(D_plus) + '1'
     negative_part = '0'.join(D_minus) + '0$'
     self.word = positive_part + negative_part
     tree = SuffixTree(self.word)
     self.suffixTree = tree
     self.alphabet = alphabet | {'0', '1', '$'}
     self.edges = tree.edges
     # bug fix: a root that has outgoing edges gets suffix_node 0
     if self.__has_edges(0):
         tree.nodes[0].suffix_node = 0
Exemple #8
0
 def test_repeated_string(self):
     """Substring lookups on a text made of one repeated character."""
     tree = SuffixTree("aaa")
     # Every run of 'a' is first found at index 0; 'b' is absent.
     self.assertEqual(tree.find_substring('a'), 0)
     self.assertEqual(tree.find_substring('aa'), 0)
     self.assertEqual(tree.find_substring('aaa'), 0)
     self.assertEqual(tree.find_substring('b'), -1)
     self.assertTrue(tree.has_substring('a'))
     self.assertTrue(tree.has_substring('aa'))
     self.assertTrue(tree.has_substring('aaa'))
Exemple #9
0
def test_insert_multiple(lst):
    """Insert several strings and check node count and suffix membership.

    After inserting every string of ``lst`` the tree must hold
    ``sum(len(s) + 1 for s in lst) + 1`` nodes whose ``end`` is ``None``,
    and every suffix of every inserted string must be contained in it.
    """
    tree = SuffixTree()
    for string in lst:
        tree.insert_string(string)
    open_ended = [node for node in tree.nodes if node.end is None]
    assert len(open_ended) == sum(len(string) + 1 for string in lst) + 1
    # Check the suffixes of every string, not only the last one inserted;
    # this also keeps an empty ``lst`` from reading an unbound loop variable.
    for string in lst:
        for suffix in suffixes(string):
            assert suffix in tree
Exemple #10
0
 def measure_suffix_tree(text, name):
     """Time the construction of a SuffixTree and append the result to report.txt.

     The duration is taken twice, once with ``time.time_ns`` and once with
     ``time.time``, and both readings are written under a header that
     contains ``name``.
     """
     with open("report.txt", "a+", encoding="utf-8") as log:
         header = "Measuring time for Suffix Tree --- {}\n".format(name)
         log.write(header)
         began_ns = time.time_ns()
         began_s = time.time()
         # Bound to a name so the tree stays alive until both end stamps
         # have been taken.
         tree = SuffixTree(text)
         finished_ns = time.time_ns()
         finished_s = time.time()
         summary = "It took --- {} ns\nIt took --- {} s\n\n".format(
             finished_ns - began_ns, finished_s - began_s)
         log.write(summary)
 def test_get_depth_tuples(self):
     """``items()`` yields every stored path together with its count."""
     suffix_tree = SuffixTree.from_seq(['a', 'a', 'b'])
     expected_items = {
         (('a', '$'), 2),
         (('a', 'a', '$'), 1),
         (('a', 'a', 'b', '$'), 1),
         (('a', 'b', '$'), 1),
         (('b', '$'), 1),
     }
     expect(suffix_tree.items()).to(equal(expected_items))
Exemple #12
0
def create_suffix_tree(t):
    """Build a SuffixTree for ``t`` and print memory and timing figures.

    Prints the resident set size of the current process (in megabytes, read
    after construction) and the seconds spent constructing the tree.

    Returns:
        The constructed SuffixTree.
    """
    process = psutil.Process(os.getpid())
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    suffix_tree = SuffixTree(t)
    end = time.perf_counter()
    mem = process.memory_info().rss / 1024 / 1024  # bytes -> megabytes
    print('------------------------------------------------------')
    print('Creating index struct - memory usage: ' + str(mem) + ' Mb')
    print('Creating index struct - time elapsed: ' + str(end - start) + 's')
    print('------------------------------------------------------')
    return suffix_tree
 def test_search(self):
     """``search`` returns the start positions of a pattern, or None."""
     tree = SuffixTree('This is a test')
     # Patterns that occur once.
     self.assertEqual(tree.search('T'), [0])
     self.assertEqual(tree.search('Th'), [0])
     self.assertEqual(tree.search('h'), [1])
     # A pattern that occurs twice.
     self.assertEqual(tree.search('is'), [2, 5])
     # A pattern that does not occur.
     self.assertEqual(tree.search('qqqqq'), None)
Exemple #14
0
def test_implicit():
    """Compare ``as_dict()`` of two trees against their expected shape."""
    expectations = (
        ('abcd', {'d': {}, 'cd': {}, 'bcd': {}, 'abcd': {}}),
        ('xabxa', {'bxa': {}, 'abxa': {}, 'xabxa': {}}),
    )
    for text, expected in expectations:
        tree = SuffixTree(text)
        assert tree.as_dict() == expected
Exemple #15
0
def make_suffix_tree(filename):
    """Read ``filename`` and return one SuffixTree per title/body section.

    A small counter drives the parsing: a blank line lowers it by one, a
    non-blank line seen while it is 0 is taken as a title, and a non-blank
    line seen while it is 1 or 2 is appended to the current body. Each title
    first flushes the text collected so far into a tree, so the first tree
    in the result is built from whatever was collected before the first
    title (empty strings when the file starts with a title).
    """
    trees = []
    body = ''
    heading = ''
    state = 0
    with open(filename, "r", encoding="utf8") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                state -= 1
            elif state == 0:
                # New title: flush the previous section first.
                trees.append(SuffixTree(body, heading, False))
                body = ''
                heading = line
                state = 2
            elif state in (1, 2):
                # Body text; a single blank line does not end the section.
                state = 2
                body += line + ' '
    trees.append(SuffixTree(body, heading, False))
    return trees
 def test_two_elements_in_tree(self):
     """Counts for a sequence of two equal elements."""
     suffix_tree = SuffixTree.from_seq(['a', 'a'])
     single = suffix_tree['a']
     expect(single.count).to(equal(2))
     expect(suffix_tree['a']['a'].count).to(equal(1))
Exemple #17
0
def test_contains_not(text, search_term):
    """``search_term`` must not be reported as contained after inserting ``text``."""
    tree = SuffixTree()
    tree.insert_string(text)
    assert search_term not in tree
Exemple #18
0
    def test_chinese_text(self):
        """A Chinese phrase must be found in the UTF-8 encoded test.txt."""
        # Close the file deterministically instead of leaking the handle.
        with codecs.open("test.txt", encoding="utf-8") as handle:
            st = SuffixTree(handle.read())
        # Compare against -1, the value the sibling tests expect for a miss:
        # a bare truthiness check would pass on -1 and fail on a hit at
        # index 0.
        self.assertNotEqual(st.find_substring(u'概括性总结'), -1)
Exemple #19
0
    def test_text_string(self):
        """The letter 'a' must occur in the UTF-8 encoded test.txt."""
        # Close the file deterministically instead of leaking the handle.
        with codecs.open("test.txt", encoding='utf-8') as f:
            st = SuffixTree(f.read())
        self.assertTrue(st.has_substring(u'a'))
Exemple #20
0
 def test_empty_string(self):
     """A tree over the empty string finds nothing, not even ''."""
     empty_tree = SuffixTree('')
     # Index lookups report -1.
     self.assertEqual(empty_tree.find_substring('not there'), -1)
     self.assertEqual(empty_tree.find_substring(''), -1)
     # Membership checks report False.
     self.assertFalse(empty_tree.has_substring('not there'))
     self.assertFalse(empty_tree.has_substring(''))
 def test_empty_string(self):
     """A tree over the empty string finds nothing, not even ''."""
     empty_tree = SuffixTree('')
     # Index lookups report -1.
     self.assertEqual(empty_tree.find_substring('not there'), -1)
     self.assertEqual(empty_tree.find_substring(''), -1)
     # Membership checks report False.
     self.assertFalse(empty_tree.has_substring('not there'))
     self.assertFalse(empty_tree.has_substring(''))
 def test_case_sensitivity(self):
     """With case_insensitive=True lookups ignore the case of the pattern."""
     # Close the file deterministically instead of leaking the handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read(), case_insensitive=True)
     self.assertEqual(st.find_substring('ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 1830)
Exemple #23
0
import sys

from suffix_tree import SuffixTree

# Index test.txt case-insensitively, then look up the prefix given as the
# first command-line argument.
with open("test.txt") as source:
    corpus = source.read()
    st = SuffixTree(corpus, case_insensitive=True)

prefix = sys.argv[1]
print(st.find_word_by_prefix(prefix))
    #    for i in range(0, len(seqs[0]) - l + 1):
    #        s = seqs[0][i:i + l]
    #        #print(s)
    #        bad = False
    #        for lv in stree.leaves:
    #            if lv.pathLabel.startswith(s):
    #                bad = True
    #                continue
    #        if not bad:
    #            if len(s) < len(shortest):
    #                shortest = s
    #                print(s)
    #
    #print(shortest)

    stree = SuffixTree(seqs[0] + seqs[1])

    shortest = seqs[0]
    for l in stree.leaves:
        print(l.pathLabel)
        if len(l.pathLabel) < len(shortest):
            shortest = l.pathLabel

    print(shortest)

    #print(list((n.pathLabel for n in stree.innerNodes)))

    #res = ''
    #for l in stree.postOrderNodes:
    #    print(l.edgeLabel)
    #    res += l.edgeLabel + '\n'
Exemple #25
0
def test_create():
    """A SuffixTree can be constructed without arguments."""
    tree = SuffixTree()
    assert tree is not None
 def test_repr(self):
     """``__repr__`` of a one-character tree matches the expected table."""
     tree = SuffixTree("t")
     expected = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \t\nt\n'
     self.assertEqual(tree.__repr__(), expected)
Exemple #27
0
import re  # regular expression
import dataset

fables = []  # Stores the SuffixTree Objects. One object per Story.
string = ''  # Body text collected for the story currently being read.
title = ''   # Title of the story currently being read.
# Parser state: a blank line lowers it by one; 0 means the next non-blank
# line is a title; 1 or 2 means the next non-blank line is body text.
flag = 0
# Read from the Data file.
with open("AesopTales.txt", "r") as f:
    for i in f.readlines():
        line = i.strip()
        if len(line) == 0:
            flag = flag - 1
            continue
        elif flag == 0 and len(line) != 0:  # It is a title
            # Flush the story collected so far before starting the new one.
            fables.append(SuffixTree(string, title, True))
            string = ''
            title = line
            flag = 2
            continue
        elif flag == 1 and len(line) != 0:  # Content
            # One blank line was seen inside the body; keep collecting.
            flag = 2
            string += line + ' '
        elif flag == 2:
            string += line + ' '

# The last story has no following title to flush it, so add it here.
fables.append(SuffixTree(string, title, True))
# When the file starts with a title line, the first append above happens
# before any text was collected, so fables[0] is built from empty strings.
del fables[0]
n = len(fables)  # There are 312 stories in the Given Dataset.
print("Number of stories Read : ", n)
Exemple #28
0
    artificial_text_input += c

# Run every basic input through all three index structures. ``i`` is the
# 1-based test number used in the labels.
i = 0
# NOTE(review): the loop variable shadows the builtin ``input`` for the rest
# of the module.
for input in basic_inputs:
    i = i + 1
    trie = Trie(input)
    check_validity(input,
                   trie,
                   1,
                   test_string="Trie- Basic test no. " + str(i))
    simple_suffix_tree = SuffixTreeNaive(input)
    check_validity(input,
                   simple_suffix_tree,
                   1,
                   test_string="Simple SuffixTree- Basic test no. " + str(i))
    fast_suffix_tree = SuffixTree(input)
    check_validity(input,
                   fast_suffix_tree,
                   1,
                   test_string="McCreight- Basic test no. " + str(i))

# The trie is only checked on the first 2000 characters of the natural text.
trie = Trie(natural_text_input[:2000])
check_validity(natural_text_input[:2000],
               trie,
               10,
               test_string="Trie- law act test")
# The naive suffix tree is checked on the whole natural text.
simple_suffix_tree = SuffixTreeNaive(natural_text_input)
check_validity(natural_text_input,
               simple_suffix_tree,
               1000,
               test_string="Simple SuffixTree- law act text")
class SuffixClassifier(object):
    """Suffix-tree index over positive and negative sample strings.

    The samples are concatenated into one word in which every positive
    sample is followed by ``'1'`` and every negative sample by ``'0'``; the
    word ends with ``'$'``. ``truncate_tree`` cuts the tree at those marker
    characters and records on each node how many removed leaves sat behind
    a ``'1'`` (``positive``) or a ``'0'`` (``negative``).
    """

    def __init__(self, D_plus, D_minus, alphabet):
        """Encode the samples into one word and build its suffix tree.

        Args:
            D_plus: iterable of positive sample strings.
            D_minus: iterable of negative sample strings.
            alphabet: set of characters used by the samples; the markers
                ``'0'``, ``'1'`` and ``'$'`` are added to it.
        """
        self.word = '1'.join(D_plus) + '1' + '0'.join(D_minus) + '0$'
        self.suffixTree = SuffixTree(self.word)
        self.alphabet = alphabet | {'0', '1', '$'}
        # Same dict object as the tree's edges: removals made here are
        # visible through self.suffixTree.edges as well.
        self.edges = self.suffixTree.edges
        if self.__has_edges(0):  # bug fix
            self.suffixTree.nodes[0].suffix_node = 0

    def print_suffix_tree(self, printDirectDebug = True):
        """Print the encoded word and the tree, one root-to-leaf path per line.

        Args:
            printDirectDebug: when true, also print the suffix tree's own
                ``repr``.
        """
        print(self.word)
        self.print_tree_part('0', 0)
        print()
        if printDirectDebug:
            print(repr(self.suffixTree))

    def print_tree_part(self, prefix, nodeId):
        """Recursively print the paths that start at ``nodeId``.

        Args:
            prefix: text accumulated for the path so far.
            nodeId: index of the node whose outgoing edges are printed.
        """
        found = False
        first = True
        for e in self.suffixTree.edges.values():
            if e.source_node_index != nodeId:
                continue
            found = True
            text = self.suffixTree.string[e.first_char_index:e.last_char_index+1]
            if first:
                first = False
            else:
                # Later siblings replace the shared prefix by padding that
                # ends in a backslash.
                prefix = (' ' * (len(prefix) - 1)) + '\\'
            data_node = self.__print_data_node(e.source_node_index)
            link = "%s%s--'%s'--%d" % (prefix, data_node, text, e.dest_node_index)
            if not self.__has_edges(e.dest_node_index):
                link += self.__print_data_node(e.dest_node_index)
            self.print_tree_part(link, e.dest_node_index)
        if not found:
            # No outgoing edge: the accumulated prefix is a finished line.
            print(('\n' if prefix[0] in ('0', '\\') else '') + prefix)

    def __print_data_node(self, nodeId):
        """Return ``"(negative,positive)"`` for a counted node, else ``""``."""
        node = self.suffixTree.nodes[nodeId]
        if hasattr(node, 'negative'):
            return "(%d,%d)" % (node.negative, node.positive)

        return ""

    def truncate_tree(self):
        """Reset the counters of every node and truncate the tree from the root."""
        # Kept as a dictionary because nodes must stay reachable under their
        # original index after other nodes have been removed.
        self.nodes = {}
        for i in range(len(self.suffixTree.nodes)):
            u = self.suffixTree.nodes[i]
            u.positive = 0  # n_+
            u.negative = 0  # n_-
            self.nodes[i] = u

        self.__truncate(0)

    def __truncate(self, u: int):
        """Cut the subtree below node ``u`` at the marker characters."""
        for v, l in self.__traverse(u):
            if not any(c in l for c in ('0', '1', '$')):  # case (a)
                # No marker on this edge: keep it and descend.
                self.__truncate(v)
            elif l[0] in ('0', '1'):  # cases (b) and (c)
                # The edge starts with a marker: drop edge and subtree and
                # credit the removed leaves to u.
                leafs = self.__remove_with_edge(u, v, l)
                self.__increment(u, l[0], leafs)
            elif l[0] == '$':  # case (d)
                self.edges.pop((u, l[0]))
                self.nodes.pop(v)  # u itself is not removed
            else:
                # A marker occurs later on the edge: shorten the edge so it
                # ends just before the first '0' or '1'.
                e = self.edges[(u, l[0])]
                sub = e.first_char_index
                for c in l:
                    if c in ('0', '1'):  # cases (e) and (f)
                        leafs = self.__remove_from_node(v)
                        self.__increment(v, c, leafs)
                        break
                    sub += 1
                e.last_char_index = sub - 1

    def __remove_from_node(self, u: int):
        """Remove everything below node ``u`` and return the number of leaves removed.

        ``u`` itself is kept and marked as a leaf (``suffix_node == -1``).
        A node that already is a leaf counts as one leaf.
        """
        if self.nodes[u].suffix_node == -1:  # speedup, leaf
            return 1
        leafs = 0
        for v, l in self.__traverse(u):
            leafs += self.__remove_from_node(v)
            self.edges.pop((u, l[0]))
            self.nodes.pop(v)

        self.nodes[u].suffix_node = -1
        return leafs

    # The annotation for ``l`` is a string so that defining the class does
    # not require ``Label`` to be defined first.
    def __remove_with_edge(self, u: int, v: int, l: "Label"):
        """Remove the edge ``u -> v`` with the subtree of ``v``; return the leaf count."""
        leafs = self.__remove_from_node(v)
        self.edges.pop((u, l[0]))
        self.nodes.pop(v)

        # u may have lost its last edge and become a leaf
        if not self.__has_edges(u):
            self.nodes[u].suffix_node = -1

        return leafs

    def recalculate(self, u):
        """Add the counters of all descendants of ``u`` to ``u``.

        Returns:
            The tuple ``(negative, positive)`` of node ``u`` after the update.
        """
        if self.nodes[u].suffix_node == -1:
            return self.nodes[u].negative, self.nodes[u].positive
        negative = 0
        positive = 0

        for v, _ in self.__traverse(u):
            neg, pos = self.recalculate(v)
            negative += neg
            positive += pos
        self.__increment(u, '0', negative)
        self.__increment(u, '1', positive)
        return self.nodes[u].negative, self.nodes[u].positive

    def __increment(self, u: int, kind: str, leafs):
        """Add ``leafs`` to the negative counter for ``'0'``, else to the positive one."""
        if kind == '0':
            self.nodes[u].negative += leafs
        else:
            self.nodes[u].positive += leafs

    def __has_edges(self, u: int):
        """Return True when node ``u`` has at least one outgoing edge."""
        return any(self.__traverse(u))

    def __traverse(self, u):
        """Yield ``(destination node index, edge label)`` for each edge leaving ``u``."""
        for c in self.alphabet:
            if (u, c) in self.edges:
                e = self.edges[(u, c)]
                l = Label(self, e.first_char_index, e.last_char_index)
                yield (e.dest_node_index, l)
Exemple #30
0
class TestSuffixTree(unittest.TestCase):
    """Tests for a SuffixTree built over the string 'abracadabra'."""

    # Shared fixture: created once, when the class body is executed.
    suffix_tree = SuffixTree('abracadabra')

    def test_get_suffix_array(self):
        """The suffix array must contain the entry 'abra'."""
        self.assertIn('abra', self.suffix_tree.get_suffix_array())
Exemple #31
0
def test_find_all(text_search_string_locations):
    """``find_all`` must report exactly the expected locations.

    The argument is a ``(text, search_string, locations)`` triple; the
    second element of every result entry is compared, order-independently,
    with ``locations``.
    """
    text, search_string, locations = text_search_string_locations
    tree = SuffixTree()
    tree.insert_string(text)
    matches = tree.find_all(search_string)
    found_positions = [match[1] for match in matches]
    assert sorted(found_positions) == sorted(locations)
 def test_repr(self):
     """``__repr__`` of a one-character tree matches the expected table."""
     st = SuffixTree("t")
     output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
     # The leftover pdb.set_trace() breakpoint was removed: it stops every
     # unattended test run at an interactive prompt.
     self.assertEqual(st.__repr__(), output)
import sys

from suffix_tree import SuffixTree

# Read the input file named on the command line; only its first line is
# analysed. The handle is closed as soon as the content has been read.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")

text = Input[0].strip()

stree = SuffixTree(text)

# Longest path label among the inner nodes of the tree. Starting from the
# empty string keeps the result defined when no inner node has a non-empty
# label.
max_len = 0
longestRep = ''
for i in stree.innerNodes:
    s = i.pathLabel
    if len(s) > max_len:
        max_len = len(s)
        longestRep = s

# write() works on Python 2 and 3 alike (``print >> out`` is Python 2 only),
# and the with-block closes the output file.
with open("output.txt", 'w') as out:
    out.write("{}\n".format(longestRep))
Exemple #34
0
 def build(self):
     """Build a SuffixTree from ``self.corpus_str`` and store it on ``self.suffix_tree``."""
     self.suffix_tree = SuffixTree(self.corpus_str)
Exemple #35
0
 def test_repeated_string(self):
     """Substring lookups on a text made of one repeated character."""
     tree = SuffixTree("aaa")
     # Every run of 'a' is first found at index 0; 'b' is absent.
     self.assertEqual(tree.find_substring('a'), 0)
     self.assertEqual(tree.find_substring('aa'), 0)
     self.assertEqual(tree.find_substring('aaa'), 0)
     self.assertEqual(tree.find_substring('b'), -1)
     self.assertTrue(tree.has_substring('a'))
     self.assertTrue(tree.has_substring('aa'))
     self.assertTrue(tree.has_substring('aaa'))

     # Longer than the text, or a different character: not contained.
     self.assertFalse(tree.has_substring('aaaa'))
     self.assertFalse(tree.has_substring('b'))
     # Lookups are case sensitive by default.
     self.assertFalse(tree.has_substring('A'))
Exemple #36
0
def test_occurrances(text_search_term_insertion_count):
    """``occurrences`` must return the expected count for the search term.

    The argument is a ``(text, search_term, insertion_count)`` triple.
    """
    text, search_term, insertion_count = text_search_term_insertion_count
    tree = SuffixTree()
    tree.insert_string(text)
    counted = tree.occurrences(search_term)
    assert counted == insertion_count
Exemple #37
0
 def test_long_string(self):
     """Case-sensitive lookups at known offsets in test.txt."""
     # Close the file deterministically instead of leaking the handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read())
     self.assertEqual(st.find_substring('Ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 11131)
     self.assertFalse(st.has_substring('ukkonen'))
Exemple #38
0
    def test_chinese_string(self):
        """Substring checks on a short Chinese text."""
        # Body re-indented with spaces: the original mixed a tab-indented
        # body with a space-indented ``def``, which Python 3 rejects.
        st = SuffixTree(u"才高八斗")
        self.assertTrue(st.has_substring(u'高'))
        self.assertFalse(st.has_substring(u'豆豆'))
Exemple #39
0
 def test_case_sensitivity(self):
     """With case_insensitive=True lookups ignore the case of the pattern."""
     # Close the file deterministically instead of leaking the handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read(), case_insensitive=True)
     self.assertEqual(st.find_substring('ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 1830)
Exemple #40
0
from suffix_tree import SuffixTree
import sys

# Read the whole input file; the with-block closes the handle, which the
# original one-liner left open.
with open(sys.argv[1]) as source:
    File = source.read()
# Build the suffix tree over the file content.
stree = SuffixTree(File)
Exemple #41
0
 def test_repr(self):
     """``__repr__`` of a one-character tree matches the expected table."""
     st = SuffixTree("t")
     output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
     # The leftover pdb.set_trace() breakpoint was removed: it stops every
     # unattended test run at an interactive prompt.
     self.assertEqual(st.__repr__(), output)
Exemple #42
0
#! /usr/bin/env python

from suffix_tree import SuffixTree
import sys



if __name__ == "__main__":
    # Only the first line of the file named on the command line is used.
    with open(sys.argv[1]) as fd:
        text = fd.readline().strip()

    # The tree is created with the text length and fed one character at a
    # time.
    tree = SuffixTree(len(text))
    for char in text:
        tree.add_char(char)

    # Parenthesised single-argument print behaves the same on Python 2 and
    # is valid syntax on Python 3.
    print(tree.depthFirstSearch())
    def test_repeated_string(self):
        """Substring lookups on a text made of one repeated character."""
        tree = SuffixTree("aaa")
        # Every run of 'a' is first found at index 0; 'b' is absent.
        self.assertEqual(tree.find_substring('a'), 0)
        self.assertEqual(tree.find_substring('aa'), 0)
        self.assertEqual(tree.find_substring('aaa'), 0)
        self.assertEqual(tree.find_substring('b'), -1)
        self.assertTrue(tree.has_substring('a'))
        self.assertTrue(tree.has_substring('aa'))
        self.assertTrue(tree.has_substring('aaa'))

        # Longer than the text, or a different character: not contained.
        self.assertFalse(tree.has_substring('aaaa'))
        self.assertFalse(tree.has_substring('b'))
        # Lookups are case sensitive by default.
        self.assertFalse(tree.has_substring('A'))
 def test_elements_are_reversed(self):
     """'a' followed by 'b' is counted once; 'b' followed by 'a' never."""
     suffix_tree = SuffixTree.from_seq(['a', 'a', 'b'])
     a_then_b = suffix_tree['a']['b']
     expect(a_then_b.count).to(equal(1))
     b_then_a = suffix_tree['b']['a']
     expect(b_then_a.count).to(equal(0))
 def test_long_string(self):
     """Case-sensitive lookups at known offsets in test.txt."""
     # Close the file deterministically instead of leaking the handle.
     with open("test.txt") as f:
         st = SuffixTree(f.read())
     self.assertEqual(st.find_substring('Ukkonen'), 1498)
     self.assertEqual(st.find_substring('Optimal'), 11131)
     self.assertFalse(st.has_substring('ukkonen'))
    def test_single_suffix_tree(self):
        """Counts for a tree built from the single-element sequence ['a']."""
        suffix_tree = SuffixTree.from_seq(['a'])

        # Present element, absent element, then the terminator below 'a'.
        expect(suffix_tree['a'].count).to(equal(1))
        expect(suffix_tree['b'].count).to(equal(0))
        terminator = suffix_tree['a']['$']
        expect(terminator.count).to(equal(1))
Exemple #47
0
from suffix_tree import SuffixTree, GST
from document import Directory
#f = open("AesopTales.txt")
# Load all stories into one string and index it with a single suffix tree.
# NOTE(review): this section uses Python 2 print statements.
Aesop=Directory()
string=Aesop.documentify("AesopTales.txt")
tree= SuffixTree(string)

# All occurrences of a substring in all the stories.
# NOTE(review): under Python 2, input() evaluates the typed text as an
# expression; raw_input() reads it as a plain string.
ip=input()
indices=tree.all_occurences(ip)
print " all occurences of",ip,": ",indices
count=0
for i in indices:
    title=''
    j=i  # offset of the hit, made relative to its story below
    for doc in Aesop.docs:
        # NOTE(review): both bounds are exclusive, so a hit located exactly
        # at doc.start is not attributed to that story - confirm intended.
        if(i> doc.start and i< doc.end):
            title=doc.title
            j-=doc.start
    # Relative offset, story title and the 40 characters starting at the hit.
    print j, title,  string[i:i+40],"\n\n"
    count+=1
print"(",count,"occurences )"

#first occurence/closest match in each story
query=input()

for doc in Aesop.docs:
    if doc.start-doc.end >=0:
	continue
    story= string[doc.start:doc.end]
    st=SuffixTree(story)