def test_find_5(self, _, builder):
    # Two sequences that each consist of one repeated symbol: the number of
    # hits returned by find_all() must equal the number of repetitions.
    sequences = {'A': 'aaaaa', 'B': 'bbbb'}
    tree = Tree(sequences, builder=builder)

    expected_hits = (('a', 5), ('b', 4))
    for symbol, count in expected_hits:
        self.assertEqual(len(tree.find_all(symbol)), count, msg=symbol)
def test_find_3(self, _, builder):
    # Gusfield1997 Figure 7.1 Page 129
    tree = Tree({'A': 'xyxaxaxa'}, builder=builder)

    # Substrings of the indexed sequence must be found ...
    for present in ('xyxaxaxa', 'xax', 'axa'):
        self.assertTrue(tree.find(present), msg=present)

    # ... while a string that never occurs in it must not.
    self.assertFalse(tree.find('ay'))
def generate_maximal_repeats_sequence(df, tictoc, min_length=1):
    """Return the maximal repeats found across the trace variants in *df*.

    Args:
        df: DataFrame with an ``event`` column whose values are
            space-separated strings (one string per trace).
        tictoc: Timer object; ``tic()`` is called on entry and ``toc()`` is
            called once at the end, its result being printed with ``%.2f``.
        min_length: Minimum number of space-separated elements a repeat must
            have to be reported.  The default of ``1`` keeps every repeat,
            which is what the previous hard-coded ``> 0`` check did.
            NOTE(review): the original comment said "only repeats with two or
            more elements" -- pass ``min_length=2`` if that is the intent.

    Returns:
        list[str]: ``str(path)`` of every maximal repeat that passes the
        length filter, in the sorted order of ``tree.maximal_repeats()``.
    """
    tictoc.tic()

    # Count variants and then duplicate variants with count > 1.
    # This way we can create a tree based on variants instead of adding each
    # trace.  When count > 1 it is sufficient to have only two copies of a
    # variant for maximal repeats to be discovered.
    counts = df.event.value_counts()
    variants = list(counts.index)
    # Duplicates go after all unique variants, preserving value_counts order
    # (same layout DataFrame.append(..., ignore_index=True) used to produce;
    # DataFrame.append itself no longer exists in pandas >= 2.0).
    variants.extend(event for event, count in counts.items() if count > 1)

    dict_tree = {index: event.split(" ") for index, event in enumerate(variants)}
    tree = Tree(dict_tree)

    maximal_repeats_sequence = []
    for _, path in sorted(tree.maximal_repeats()):
        sfx = str(path)
        if len(sfx.split(' ')) >= min_length:
            maximal_repeats_sequence.append(sfx)

    print("Suffix Tree generated.\t Processing Time: %.2fs" % (tictoc.toc()))
    return maximal_repeats_sequence
def test_to_dot(self):
    # Smoke test: rendering the tree to DOT and writing it out must not bomb.
    import os
    import tempfile

    # tree = Tree ({ 'A' : 'aaaaa' })
    # tree = Tree ({ 'A' : 'ababcabcdabdeaef' })
    tree = Tree({'A': 'xabxac', 'B': 'awyawxawxz'})
    dot = tree.to_dot()

    # Use the platform's temp directory instead of a hard-coded '/tmp' so the
    # test also runs where that path does not exist.  The file name is kept
    # so the output can still be inspected after the run.
    dot_path = os.path.join(tempfile.gettempdir(), 'suffix_tree.dot')
    with open(dot_path, 'w') as tmp:
        tmp.write(dot)

    # The text-mode write above only succeeds for str, so this replaces the
    # former vacuous `1 == 1` check without making the test any stricter.
    self.assertIsInstance(dot, str)
def main():
    """Report the longest byte strand shared between the ``sample.*`` files.

    Reads every file matching ``sample.*`` (relative to the current working
    directory) in binary mode, builds one generalized suffix tree over all
    of them and prints what ``longest()`` finds at the tree's root.
    """
    search_dir = ''
    pattern = os.path.join(search_dir, 'sample.*')

    # Map each matched file name to its raw bytes.
    data = {}
    for filename in glob.glob(pattern):
        absolute = os.path.join(os.getcwd(), filename)
        with open(absolute, 'rb') as handle:
            data[filename] = handle.read()
        print(filename)

    # Generalized suffix tree keyed by file name.
    t2 = Tree(data)

    # Original author's note on their sample data:
    #   length of the longest common byte substring is 27648,
    #   the files are sample.3 and sample.2,
    #   the starting offsets for the files are 17408, 3072.
    result = longest(t2.root)
    strand_length = result[0][0]
    print('the length of the longest common substring(strand):', strand_length)
    print('the files containing the strings are :', result[1])
    print('the starting offset in the respective files are:', result[2])
def melody_identifier(pm, note_shift):
    """Run the melody-detection pipeline on *pm* and return its result.

    The note dictionary derived from *pm* is indexed in a suffix tree; the
    paths grabbed from that tree are filtered, turned into a weighted
    sequence using *note_shift*, and finally passed together with the integer
    note dictionary to ``get_melody_instrument``, whose value is returned.
    """
    note_sequences = get_notes_dict(pm)
    int_sequences = get_notes_dict_int(pm)

    candidate_paths = filter_paths(grab_paths(Tree(note_sequences)))
    return get_melody_instrument(
        construct_weighted_seq(candidate_paths, note_shift), int_sequences)
def get_BFMT(file=INPUT, n_docs=N_DOCS):
    """Load documents from a CSV file and index them in a suffix tree.

    Args:
        file: CSV path handed to ``pandas.read_csv`` (read without a header).
        n_docs: Maximum number of rows to read.

    Returns:
        tuple: ``(tree, docs)`` where ``docs`` is the list of normalised
        texts taken from column 5 and ``tree`` is a ``Tree`` keyed by each
        document's position in that list.
    """

    def _normalise(text):
        # Lower-case, then blank out every character outside 7-bit ASCII.
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text.lower())

    frame = pd.read_csv(file, header=None, engine='python', nrows=n_docs)
    docs = list(frame[5].apply(_normalise))
    tree = Tree(dict(enumerate(docs)))
    return tree, docs
def test_1(self):
    """ Test the longest string common to 2 strings """
    sequences = {'1': 'xabzxa', '2': 'yabz'}
    result = longest(Tree(sequences).root)

    lengths, owners, offsets = result[0], result[1], result[2]
    self.assertEqual(lengths[0], 3)
    self.assertEqual(owners[0], ['1', '2'])
    self.assertEqual(offsets[0], [1, 1])
def test_2(self):
    """ Test 2 strings where string '1' contains a repeated segment """
    sequences = {'1': 'axabxzaxabx', '2': 'ybxzxda'}
    result = longest(Tree(sequences).root)

    lengths, owners, offsets = result[0], result[1], result[2]
    self.assertEqual(lengths[0], 3)
    self.assertEqual(owners[0], ['1', '2'])
    self.assertEqual(offsets[0], [3, 1])
def test_3(self):
    """ Test the longest string common to 3 strings """
    sequences = {'1': 'xabzxabxab', '2': 'yabzdxabx', '3': 'zavxabx'}
    result = longest(Tree(sequences).root)

    lengths, owners, offsets = result[0], result[1], result[2]
    self.assertEqual(lengths[0], 4)
    self.assertEqual(owners[0], ['1', '2', '3'])
    self.assertEqual(offsets[0], [4, 5, 3])
def test_lca(self):
    # After prepare_lca(), the lowest common ancestor of selected leaf pairs
    # must be the node with the expected id.
    tree = Tree({'A': 'xabxac', 'B': 'awyawxawxz'})
    tree.prepare_lca()

    # ((string id, index), (string id, index), expected ancestor id)
    cases = (
        (('A', 1), ('B', 3), 8),
        (('A', 0), ('B', 8), 2),
        (('B', 1), ('B', 7), 19),
        (('A', 0), ('B', 7), 1),
    )
    for (first_id, first_pos), (second_id, second_pos), ancestor_id in cases:
        first_leaf = tree.nodemap[first_id][first_pos]
        second_leaf = tree.nodemap[second_id][second_pos]
        self.assertEqual(tree.lca(first_leaf, second_leaf).id, ancestor_id)
def test_find_7(self, _, builder):
    # Every sequence is a suffix of the previous one, so a pattern is found
    # once in each sequence that is at least as long as the pattern.
    sequences = {
        'A': 'abcde',
        'B': 'bcde',
        'C': 'cde',
        'D': 'de',
        'E': 'e',
    }
    tree = Tree(sequences, builder=builder)

    expected_hits = (
        ('abcde', 1),
        ('bcde', 2),
        ('cde', 3),
        ('de', 4),
        ('e', 5),
    )
    for pattern, count in expected_hits:
        self.assertEqual(len(tree.find_all(pattern)), count, msg=pattern)
def test_find_6(self, _, builder):
    # Every sequence is a prefix of the next one, so a pattern is found once
    # in each sequence that is at least as long as the pattern.
    sequences = {
        'A': 'a',
        'B': 'ab',
        'C': 'abc',
        'D': 'abcd',
        'E': 'abcde',
    }
    tree = Tree(sequences, builder=builder)

    expected_hits = (
        ('abcde', 1),
        ('abcd', 2),
        ('abc', 3),
        ('ab', 4),
        ('a', 5),
    )
    for pattern, count in expected_hits:
        self.assertEqual(len(tree.find_all(pattern)), count, msg=pattern)
def test_maximal_repeats_2(self, _, builder):
    # Five sequences sharing 'abc' in different contexts; the expected
    # strings are checked by the class's maximal_repeats() helper.
    sequences = {
        'A': '1abc2abc3',
        'B': '4abc5abc6',
        'C': '7abc',
        'D': '7abc',
        'E': 'abc',
    }
    expected = [
        '2 7 a b c',
        '5 a b c',
    ]
    tree = Tree(sequences, builder=builder)
    self.maximal_repeats(tree, expected)
def test_find_4(self, _, builder):
    # Sequences here are lists of multi-character tokens rather than plain
    # strings, so both the data and the patterns are whitespace-split.
    sequences = {
        'A': ('232 020b 092 093 039 061 102 135 098 099 '
              '039 040 039 040 044 141 140 098').split(),
        'B': '097 098 039 040 041 129 043'.split(),
        'C': ('097 098 039 040 020a 022 023 097 095 094 098 '
              '043 044 112 039 020b 039 098').split(),
    }
    tree = Tree(sequences, builder=builder)

    present = (
        '039 040 041',
        '039 040 039 040',
        '020a 022 023',
        '232 020b 092',
        '097 098 039 040',
        '141 140 098',
    )
    for pattern in present:
        self.assertTrue(tree.find(pattern.split()), msg=pattern)

    self.assertFalse(tree.find('039 040 042'.split()))
def test_find_1(self, _, builder):
    # Gusfield1997 Figure 5.1 Page 91
    tree = Tree({'A': 'xabxac'}, builder=builder)

    present = (
        'x', 'xa', 'xab', 'xabx', 'xabxa', 'xabxac',
        'abxac', 'bxac', 'xac', 'ac', 'c',
    )
    for pattern in present:
        self.assertTrue(tree.find(pattern), msg=pattern)

    absent = ('d', 'xx', 'xabxaa')
    for pattern in absent:
        self.assertFalse(tree.find(pattern), msg=pattern)
class LongestPalindromeFinder(tk.Frame):
    """Tk frame that finds the longest palindrome in a user-supplied sequence.

    The sequence is typed into (or loaded from a file into) a text widget.
    "Run" builds a suffix tree over the sequence and its reverse, looks up
    lowest common ancestors of matching leaf pairs and hands the longest
    palindrome to ``output``; "Export Tree" renders the tree to PDF through
    graphviz.
    """

    def __init__(self, master=None):
        """Create and lay out the widgets; no sequence or tree is loaded yet."""
        super().__init__(master=master)

        tk.Label(self, text='Sequence').grid(row=0, column=0, sticky='w')
        self.sequence_text = tk.Text(self)
        self.sequence_text.grid(row=1, column=0, columnspan=2, sticky='nsew')
        self.sequence_load_btn = tk.Button(
            self, text='Choose file', command=self.choose_sequence_file)
        self.sequence_load_btn.grid(row=2, column=1, sticky='e')

        buttons_frame = tk.Frame(self)
        self.run_btn = tk.Button(
            master=buttons_frame, text='Run', command=self.run)
        self.run_btn.grid(row=0, column=1)
        self.export_btn = tk.Button(
            master=buttons_frame, text='Export Tree', command=self.export_tree)
        self.export_btn.grid(row=0, column=0)
        buttons_frame.grid(row=3, column=0, columnspan=2, sticky='e')

        for i in range(2):
            self.grid_columnconfigure(i, weight=1)
        for i in range(4):
            self.grid_rowconfigure(i, weight=1)

        self.sequence = None
        self.tree = None

    @staticmethod
    def _warn_no_file():
        """Tell the user that no usable file was chosen."""
        messagebox.showwarning(title='Bad file', message='No files selected')

    def choose_sequence_file(self):
        """Replace the text widget's content with the content of a chosen file.

        The widget is only touched once the file has been read successfully,
        so cancelling the dialog (or picking a missing file) keeps whatever
        the user had already entered.
        """
        filename = filedialog.askopenfilename(parent=self)
        if not filename:
            # The dialog was cancelled (empty string or empty tuple).
            self._warn_no_file()
            return
        try:
            # Context manager so the handle is closed even if read() fails.
            with open(filename, 'r') as handle:
                contents = handle.read()
        except FileNotFoundError:
            self._warn_no_file()
            return
        self.sequence_text.delete(1.0, tk.END)
        self.sequence_text.insert(tk.END, contents)

    def load_sequence(self):
        """Copy the widget text into ``self.sequence``, trimming newlines/spaces."""
        self.sequence = self.sequence_text.get(1.0, tk.END).strip("\n ")

    def construct_tree(self):
        """Build the suffix tree over the sequence (id 1) and its reverse (id 2)."""
        self.tree = Tree({1: self.sequence, 2: reversed(self.sequence)})

    def initialize(self):
        """Load the sequence and build its tree.

        Returns:
            bool: ``True`` when a non-empty sequence was loaded and the tree
            was built; ``False`` (after showing an error box) otherwise.
        """
        self.load_sequence()
        if not self.sequence:
            messagebox.showerror(
                title='Bad input', message='Invalid input sequence')
            return False
        self.construct_tree()
        return True

    def run(self):
        """Find the longest palindrome and pass it to ``output``."""
        # Without this guard an empty input would go on to use a missing
        # (None) or stale tree from a previous run.
        if not self.initialize():
            return
        result = str(self.find_the_longest_palindrome()).replace(' ', '')
        output(result, 'longest_palindrome_found.txt')

    def build_neighbor_leaves(self):
        """Pair up the leaves whose LCA describes a palindrome arm.

        Returns:
            tuple: ``(odd, even)`` where ``odd`` is a list of
            ``(leaf_in_sequence, leaf_in_reverse)`` pairs, one per position,
            and ``even`` is a list of ``(middle_symbol, leaf_in_sequence,
            leaf_in_reverse)`` triples, one per position whose symbol equals
            the preceding one.
        """
        leaves = {}

        def f(node):
            # Index every leaf by (string id, start offset of its path).
            if node.is_leaf():
                leaves[(node.str_id, node.path.start)] = node

        self.tree.root.post_order(f)

        length = len(self.sequence)
        odd_neighbor_leaves = [
            (leaves[1, i], leaves[2, length - 1 - i])
            for i in range(length)
        ]
        # NOTE(review): the range stops at length - 2, so an equal pair made
        # of the last two symbols is never considered -- confirm whether that
        # is intended (it depends on which leaf offsets the tree provides).
        even_neighbor_leaves = [
            (self.sequence[i], leaves[1, i + 1], leaves[2, length + 1 - i])
            for i in range(1, length - 1)
            if self.sequence[i - 1] == self.sequence[i]
        ]
        return odd_neighbor_leaves, even_neighbor_leaves

    def find_the_longest_odd_palindrome(self, neighbor_leaves):
        """Return the longest odd-length palindrome as a string.

        Args:
            neighbor_leaves: ``(leaf, leaf)`` pairs as produced by
                ``build_neighbor_leaves``.
        """
        candidate_node = None
        for node1, node2 in neighbor_leaves:
            node = self.tree.lca(node1, node2)
            if candidate_node is None or len(node) > len(candidate_node):
                candidate_node = node
        result = str(candidate_node).replace(' ', '')
        # Mirror the arm around its first symbol, which is the centre and
        # must therefore appear only once.
        result = (result[::-1])[:-1] + result
        return result

    def find_the_longest_even_palindrome(self, neighbor_leaves):
        """Return the longest even-length palindrome, or ``''`` if none exists.

        Args:
            neighbor_leaves: ``(middle, leaf, leaf)`` triples as produced by
                ``build_neighbor_leaves``.
        """
        if not neighbor_leaves:
            return ''
        candidate_node = None
        middle = None
        for mid, node1, node2 in neighbor_leaves:
            node = self.tree.lca(node1, node2)
            if candidate_node is None or len(node) > len(candidate_node):
                candidate_node = node
                middle = mid
        if candidate_node == self.tree.root:
            # The arms share nothing: only the doubled middle symbol remains.
            candidate_node = ''
        result = str(candidate_node).replace(' ', '')
        result = result[::-1] + (middle * 2) + result
        return result

    def find_the_longest_palindrome(self):
        """Return the longer of the best odd and best even palindromes."""
        odd_neighbor_leaves, even_neighbor_leaves = self.build_neighbor_leaves()
        self.tree.prepare_lca()
        odd_palindrome = self.find_the_longest_odd_palindrome(
            odd_neighbor_leaves)
        even_palindrome = self.find_the_longest_even_palindrome(
            even_neighbor_leaves)
        return max(odd_palindrome, even_palindrome, key=len)

    def export_tree(self):
        """Render the suffix tree of the current input to a PDF and open it."""
        if not self.initialize():
            return
        filename = filedialog.asksaveasfilename(parent=self.master)
        if not filename:
            # Save dialog cancelled: nothing to render to.
            return
        graphviz.Source(self.tree.to_dot()).render(
            filename=filename, format='pdf', view=True, cleanup=True)
# def pysequitur_map(str_seq_unique, win_len): # print(str_seq_unique[0:20]) # total_str = np.concatenate([str_seq_unique[0:-win_len//2:-1], str_seq_unique, str_seq_unique[-win_len//2:-1:-1]]).tolist() # print(len(str_seq_unique)) # print(len(total_str)) # # for i in range(win_len//2, len(total_str)-win_len//2): # temp_str = total_str[i-win_len//2:i+win_len//2] # print(temp_str) # s_temp = psq.Sequencer2(temp_str) # print(s_temp.get()) # print(s_temp.resolve()) # psq.print_grammar(s_temp) suffixtree = Tree({"A": ch1_word_lst[:20]}) suffixtree.prepare_lca() from graphviz import Source dot_object = suffixtree.to_dot() s = Source(dot_object) s.render("test.gv", view=True) # pysequitur_map(ch1_word_lst, 20) ch1_clean_str = removeShortSegments(ch1_connotated_lst_unique.split(" "), ch1_count_lst) ch1_clean_str_unique = runLengthEncoding(ch1_clean_str) # print(ch1_connotated_lst)
def test_maximal_repeats_1(self, _, builder):
    # See [Gusfield1997]_ §7.12.1, 144ff.
    sequences = {'A': 'xabxac', 'B': 'awyawxawxz'}
    expected = [
        '2 x a',
    ]
    tree = Tree(sequences, builder=builder)
    self.maximal_repeats(tree, expected)
def test_find_2(self, _, builder):
    # Gusfield1997 Figure 5.2 Page 92
    tree = Tree({'A': 'awyawxawxz'}, builder=builder)

    for present in ('awx', 'awy'):
        self.assertTrue(tree.find(present), msg=present)

    self.assertFalse(tree.find('awz'))
def construct_tree(self):
    """Build ``self.tree`` from the sequence (id 1) and its reverse (id 2)."""
    forward = self.sequence
    sequences = {1: forward, 2: reversed(forward)}
    self.tree = Tree(sequences)
def test_maximal_repeats_3(self, _, builder):
    # Larger fixture: each key names one sequence and each value is that
    # sequence as a list of string tokens.  The tree is built with the
    # supplied builder and checked through the class's maximal_repeats()
    # helper with min_cv=4.
    # pylint: disable=line-too-long
    tree = Tree(
        {
            'berlin-sb-lat-qu-931': ['157', '165', '167', '165'],
            'cava-dei-tirreni-bdb-4': [
                '232', '020b', '092', '093', '039', '061', '102', '135',
                '098', '099', '039', '040', '039', '040', '044', '141',
                '140', '098', '121', '098', '090', '095', '134', '135',
                '088', '104', '139', '140', '067', '091', '094', '158',
                '159', '161', '162', '163', '164', '165', '158', '165',
                '161'
            ],
            'gotha-flb-memb-i-84': [
                'A000', '176', '039', '040', '020a', '022', '023', '097',
                '095', '094', '098', '043', '044', '112', '039', '020b',
                '039', '098', '140', '141', '138', '139', 'A000', '196',
                '112', '105', '134', '157', '166', '163', '175', '165',
                '191', '192', '193', '209', '210', '228', '213', '212',
                '216', '217'
            ],
            'heiligenkreuz-sb-217': [
                '249', '068', '134', '157', '165', '167', '201', '165',
                '188', '191', '193', '188', '192', '016', '015', '013',
                '055', '056', '042', '052', '066', '020a', '078', '191',
                '131', '192', '195', '266', '242', '293', '273b', '287',
                '094', '040', '020a', '023', '095', '094', '098', '020b',
                '157', '163', '165', '196'
            ],
            'ivrea-bc-xxxiii': [
                '068', '020a', '022', '023', '095', '094', '021', '097',
                '098', '039', '040', '039', '040', '041', '129', '043',
                '044', '112', '088', '104', '134', '135', '139'
            ],
            'ivrea-bc-xxxiv': [
                '022', '020a', '022', '023', '095', '094', '021', '097',
                '098', '039', '040', '039', '040', '041', '129', '043',
                '098', '099', '101', '100', '020a', '044', '112', '088',
                '134', '135', '060', '140', '141', '138', '139', '157',
                '165', '164', '163', '093', '201', '159'
            ],
            'modena-bc-o-i-2': [
                '163', '039', '040', '020a', '022', '023', '097', '095',
                '094', '098', '043', '044', '112', '039', '020b', '039',
                '098', '157', '163', '165', '191', '192', '193', '196',
                '216', '217'
            ],
            'muenchen-bsb-lat-19416': [
                '020a', '093', '020a', '022', '023', '095', '046', '067',
                '047', '039', '040', '039', '040', '041', '129', '043',
                '044', '112', '099', '097', '098', '103', '093', '105',
                '191', '192', '193', '201'
            ],
            'muenchen-bsb-lat-29555-1': [
                'M024', '163', '175', '165', 'M015', '196', '193', '209',
                '210', '228', '273', '224'
            ],
            'muenchen-bsb-lat-3853': [
                '249', '068', '134', '157', '165', '167', '201', '165',
                '188', '191', '193', '188', '192', '016', '015', '013',
                '055', '056', '042', '052', '066', '020a', '078', '191',
                '131', '192', '195', '252a', '266', '242', '293', '273b',
                '287', '094', '040', '020a', '023', '095', '094', '098',
                '020b', '157', '163', '165', '196'
            ],
            'muenchen-bsb-lat-6360': ['157', '165', '167', '201', '165'],
            'paris-bn-lat-3878': [
                '249', '157', '165', '167', '188', '191', '293', '165',
                '196', '252a'
            ],
            'paris-bn-lat-4613': [
                '020a', '094', '095', '096', '025', '098', '039', '040',
                '022', '023', '105', '033', '121', '141', '191', '192',
                '158', '165', '201', '215', '214', '219'
            ],
            'salzburg-bea-st-peter-a-ix-32': [
                '252', '191', '201', 'M023', '215', '210', '093', '112',
                '293', '252a', '010', '011', '293'
            ],
            'st-gallen-sb-733': ['020a', '089', '022', '023', '097', '094'],
            'st-paul-abs-4-1': [
                '092', '093', '020a', '094', '095', '088', '044', '046',
                '067', '098', '099', '041', '043', '044', '039', '098',
                '134', '090', '113', '139', '140', '141', '140', '139',
                '158', '140', '165', '181'
            ],
            'vatikan-bav-chigi-f-iv-75': [
                '232', '020b', '092', '093', '039', '061', '102', '135',
                '098', '099', '039', '040', '039', '040', '044', '141',
                '140', '098', '090', '134', '135', '088', '104', '139',
                '140', '067', '091', '094', '095', '158', '159', '098',
                '159', '162', '163', '164', '165', '161', '093', '201',
                '020a', '095', '088', '044', '046', '099', '041', '043',
                '044', '141', '158', '208'
            ],
            'vatikan-bav-reg-lat-1000b': ['157', '165'],
            'vatikan-bav-reg-lat-263': [
                '202', '087', '044', '224', 'A000', '163', '093', '224',
                '225', '232', '020b'
            ],
            'vatikan-bav-vat-lat-5359': ['201', '165'],
            'vercelli-bce-clxxiv': ['167', '092', '093', '165', '163', '166'],
            'vercelli-bce-clxxv': ['163'],
            'wien-oenb-ser-n-3761': ['157'],
            'wolfenbuettel-hab-blankenb-130': [
                '022', '043', '044', '060', '103', '097', '095', '067',
                '040', '020a', '112', '038', '023', '094', '041', '020a',
                '105', '098', '057', '014', '114', '039', '040', '039',
                '040', '139', '140', '156', '178', '143', '150', '138',
                '179', '174', '157', '160', '165', '228', '164', '163',
                '093', '166', '159', '102', '201', '158', '161', '093',
                '191', '192', '193', '201', '158', '180', '210', '211',
                '214', '219', '202', '022', '023', '095', '098', '041',
                '129', '043', '044', '094', '215', '131'
            ]
        },
        builder=builder)
    # Expected strings, each a leading number followed by the tokens of one
    # repeat, exactly as the maximal_repeats() helper compares them.
    self.maximal_repeats(tree, [
        '4 022 023 095',
        '4 023 095 094',
        '4 041 129 043',
        '4 043 044 112',
        '4 092 093',
        '4 095 094 098',
        '4 098 099',
        '4 134 135',
        '4 139 140',
        '4 191 192 193',
        '4 201 165',
        '5 020a 022 023',
        '5 040 020a',
        '5 044 112',
        '5 157 165 167',
        '5 191 192',
        '6 023 095',
        '6 039 040 039 040',
        '6 095 094',
        '7 043 044',
        '7 157 165',
        '8 022 023',
        '9 039 040',
    ], min_cv=4)