def create_gst_on_file(filename, gprint=False, strip=False):
    """
    Read a text file and add each of its lines to a single SuffixTree.

    Progress and the total elapsed time are printed to stdout.

    Arguments:
    filename -- path of the text file to read; every line becomes one string
                in the tree
    gprint   -- when true, the finished tree is handed to Grapher and
                createGraphviz() is called on it
    strip    -- when true, each line goes through str.strip() before it is
                added; otherwise the line is added exactly as read

    Returns the populated SuffixTree.
    """
    tree = SuffixTree()
    print("Opening file \"{0}\".".format(filename))
    started = default_timer()

    with open(filename) as handle:
        for index, raw_line in enumerate(handle):
            if strip:
                entry = raw_line.strip()
            else:
                entry = raw_line
            tree.add_string(entry)

            # Report after the first line and then after every 100000th one.
            # The number shown is the zero-based index of the line just added.
            if index % 100000 == 0:
                print("\tProcessed {0} elements".format(index))

    if gprint:
        # NOTE(review): assumed to run once after the whole file is read,
        # not once per line -- confirm against the original layout.
        Grapher(tree).createGraphviz()

    elapsed = default_timer() - started
    print("Suffix tree for \"{0}\" complete in {1} seconds".format(filename, elapsed))
    return tree
def length_distribution_on_suffix(filename, adaptersequence):
    """
    Count adapter matches and tally the lengths left once the match is removed.

    The adapter is reversed and stored in a SuffixTree; every string produced
    by generate_strings(filename) is reversed as well and looked up with
    find_prefixmatch_nr, so a match found on the reversed data corresponds to
    the end of the sequence overlapping the start of the adapter.

    Arguments:
    filename        -- passed straight to generate_strings()
    adaptersequence -- adapter string to look for

    Returns a tuple (number_of_matches, length_distribution):
    number_of_matches   -- how many sequences had a non-empty match
    length_distribution -- dict mapping len(sequence) - len(match) to the
                           number of sequences with that remaining length
    """
    tree = SuffixTree()
    # Both the adapter and the sequences are reversed so the lookup can be
    # done as a prefix match.
    tree.add_string(adaptersequence[::-1])

    matched = 0
    remaining_counts = {}

    for sequence in generate_strings(filename):
        # NOTE(review): the meaning of the trailing 0.0 argument is not
        # visible here -- see find_prefixmatch_nr.
        match = tree.find_prefixmatch_nr(sequence[::-1], tree.root, 0.0)
        match_length = len(match)

        if match_length > 0:
            matched += 1

        # NOTE(review): the tally is assumed to cover every sequence, matched
        # or not (an unmatched one contributes its full length) -- confirm
        # against the original layout.
        remaining = len(sequence) - match_length
        remaining_counts[remaining] = remaining_counts.get(remaining, 0) + 1

    return matched, remaining_counts