def construct_partial_suffix_array(word, k):
    '''Constructs the partial suffix array of the given word.

    The word is terminated with the out-of-alphabet character '$' if it does
    not already end with it, the full suffix array is built with
    construct_suffix_array, and only the entries whose value is a multiple
    of k are kept.

    Returns a list of (position, value) tuples, where position is the index
    of the entry in the full suffix array and value is the entry itself.
    '''
    # Check that the word ends in the out of alphabet character '$'.
    # The emptiness test comes first so that an empty word is terminated
    # instead of raising IndexError on word[-1].
    if not word or word[-1] != '$':
        word += '$'

    # Construct the suffix array for the given word.
    suffix_array = construct_suffix_array(word)

    # Return the position and value of elements that are multiples of k.
    return [(i, s) for i, s in enumerate(suffix_array) if s % k == 0]
def get_multi_pattern_count(word, patterns):
    '''Precomputes the BWT lookup tables and matches every pattern against the word.

    Builds the Burrows-Wheeler Transform and suffix array of word, derives
    the count and first-occurrence tables from the transform, and passes
    them together with each pattern to multi_pattern_match_bw.

    Returns a single list holding the concatenated results of
    multi_pattern_match_bw for all patterns, in the order of patterns.

    NOTE(review): this function is defined twice in this file; the later
    definition is the one that takes effect.
    '''
    # Construct the Burrows-Wheeler Transform and Suffix Array.
    bwt = burrows_wheeler_transform(word)
    suffix_array = construct_suffix_array(word)

    # Create the count dictionary: count[i][ch] is the number of times ch
    # occurs in bwt[:i].  A snapshot of the running totals is stored per
    # position, so each entry must be a copy.
    symbols = set(bwt)
    current_count = dict.fromkeys(symbols, 0)
    count = {0: dict(current_count)}
    for i, ch in enumerate(bwt, 1):
        current_count[ch] += 1
        count[i] = dict(current_count)

    # Get the index of the first occurrence of each character in the sorted
    # Burrows-Wheeler Transformation.  One pass over the sorted symbols;
    # setdefault keeps only the first index seen for each character.
    first_occurrence = {}
    for i, ch in enumerate(sorted(bwt)):
        first_occurrence.setdefault(ch, i)

    # Pass the information and patterns along to the BWMatching algorithm.
    matches = []
    for pattern in patterns:
        matches += multi_pattern_match_bw(bwt, suffix_array, first_occurrence, count, pattern)

    return matches
def get_multi_pattern_count(word, patterns):
    '''Precomputes the BWT lookup tables and matches every pattern against the word.

    Builds the Burrows-Wheeler Transform and suffix array of word, derives
    the count and first-occurrence tables from the transform, and passes
    them together with each pattern to multi_pattern_match_bw.

    Returns a single list holding the concatenated results of
    multi_pattern_match_bw for all patterns, in the order of patterns.

    NOTE(review): this function is defined twice in this file; the later
    definition is the one that takes effect.
    '''
    # Construct the Burrows-Wheeler Transform and Suffix Array.
    bwt = burrows_wheeler_transform(word)
    suffix_array = construct_suffix_array(word)

    # Create the count dictionary: count[i][ch] is the number of times ch
    # occurs in bwt[:i].  A snapshot of the running totals is stored per
    # position, so each entry must be a copy.
    symbols = set(bwt)
    current_count = dict.fromkeys(symbols, 0)
    count = {0: dict(current_count)}
    for i, ch in enumerate(bwt, 1):
        current_count[ch] += 1
        count[i] = dict(current_count)

    # Get the index of the first occurrence of each character in the sorted
    # Burrows-Wheeler Transformation.  One pass over the sorted symbols;
    # setdefault keeps only the first index seen for each character.
    first_occurrence = {}
    for i, ch in enumerate(sorted(bwt)):
        first_occurrence.setdefault(ch, i)

    # Pass the information and patterns along to the BWMatching algorithm.
    matches = []
    for pattern in patterns:
        matches += multi_pattern_match_bw(bwt, suffix_array, first_occurrence, count, pattern)

    return matches
def multi_approx_pattern_match(word, patterns, d):
    '''Returns the starting indices of all approximate matches to the given list of patterns using the seed method.

    Builds the Burrows-Wheeler Transform and suffix array of word, derives
    the count and first-occurrence tables from the transform, and then, for
    each pattern, collects the locations proposed by seed_detection and
    keeps those for which seed_extension returns True.

    d is forwarded unchanged to seed_detection and seed_extension
    (presumably the maximum number of mismatches allowed; confirm against
    those helpers).

    Returns one flat list of accepted seed indices for all patterns, in the
    order of patterns.
    '''
    # Construct the Burrows-Wheeler Transform and Suffix Array.
    bwt = burrows_wheeler_transform(word)
    suffix_array = construct_suffix_array(word)

    # Create the count dictionary: count[i][ch] is the number of times ch
    # occurs in bwt[:i].  A snapshot of the running totals is stored per
    # position, so each entry must be a copy.
    symbols = set(bwt)
    current_count = dict.fromkeys(symbols, 0)
    count = {0: dict(current_count)}
    for i, ch in enumerate(bwt, 1):
        current_count[ch] += 1
        count[i] = dict(current_count)

    # Get the index of the first occurrence of each character in the sorted
    # Burrows-Wheeler Transformation.  One pass over the sorted symbols;
    # setdefault keeps only the first index seen for each character.
    first_occurrence = {}
    for i, ch in enumerate(sorted(bwt)):
        first_occurrence.setdefault(ch, i)

    # Detect and extend seeds to find the approximate pattern locations.
    matches = []
    for pattern in patterns:
        seed_locations = seed_detection(bwt, suffix_array, first_occurrence, count, word, pattern, d)
        # The strict 'is True' comparison is kept on purpose: only an exact
        # True from seed_extension accepts a seed.
        matches += [seed_index for seed_index in seed_locations
                    if seed_extension(word, pattern, seed_index, d) is True]

    return matches