def multiple_pattern_matching(text, patterns):
    """
    CODE CHALLENGE: Solve the Multiple Pattern Matching Problem.

    Each pattern is located by a backward search over the Burrows-Wheeler
    transform of Text, and the matching rows are translated to text
    positions through the suffix array.

    Relies on the file-level helpers bwt() and suffix_array(); they are
    assumed to return, for a '$'-terminated text, its Burrows-Wheeler
    transform (a string) and the list of suffix start positions in sorted
    suffix order.

    Input: A string Text followed by a collection of strings Patterns.
        A '$' sentinel is appended to Text when it is missing.
    Output: All starting positions in Text where a string from Patterns
        appears as a substring, sorted in increasing order. A position is
        listed once for every pattern that matches there. An empty pattern
        matches every row, so it contributes every suffix array entry.
    """
    # endswith() also copes with an empty text, where text[-1] would raise
    # IndexError.
    if not text.endswith('$'):
        text += '$'
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] is the number of occurrences of c in s[:i].
    running_counts = {}
    count_symbol = [{}]
    for c in s:
        running_counts[c] = running_counts.get(c, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurence[c] is the index of the first c among the sorted
    # symbols of s.  A plain loop is used on purpose: map() is lazy on
    # Python 3 and would leave the table empty.
    first_occurence = {}
    for i, c in enumerate(sorted(s)):
        first_occurence.setdefault(c, i)

    def pattern_positions(pattern):
        """Return the text positions at which pattern occurs exactly."""
        top = 0
        bottom = len(s) - 1
        # Consume the pattern from its last symbol to its first, narrowing
        # the [top, bottom] row range at every step.
        for symbol in reversed(pattern):
            before = count_symbol[top].get(symbol, 0)
            through = count_symbol[bottom + 1].get(symbol, 0)
            if through <= before:
                # symbol does not occur in s[top:bottom + 1]: no match.
                return []
            top = first_occurence[symbol] + before
            bottom = first_occurence[symbol] + through - 1
        return [a[i] for i in range(top, bottom + 1)]

    pos = []
    for pattern in patterns:
        pos += pattern_positions(pattern)
    pos.sort()
    return pos
def multiple_pattern_matching(text, patterns):
    """
    CODE CHALLENGE: Solve the Multiple Pattern Matching Problem.

    Each pattern is located by a backward search over the Burrows-Wheeler
    transform of Text, and the matching rows are translated to text
    positions through the suffix array.

    Relies on the file-level helpers bwt() and suffix_array(); they are
    assumed to return, for a '$'-terminated text, its Burrows-Wheeler
    transform (a string) and the list of suffix start positions in sorted
    suffix order.

    Input: A string Text followed by a collection of strings Patterns.
        A '$' sentinel is appended to Text when it is missing.
    Output: All starting positions in Text where a string from Patterns
        appears as a substring, sorted in increasing order. A position is
        listed once for every pattern that matches there. An empty pattern
        matches every row, so it contributes every suffix array entry.
    """
    # endswith() also copes with an empty text, where text[-1] would raise
    # IndexError.
    if not text.endswith('$'):
        text += '$'
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] is the number of occurrences of c in s[:i].
    running_counts = {}
    count_symbol = [{}]
    for c in s:
        running_counts[c] = running_counts.get(c, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurence[c] is the index of the first c among the sorted
    # symbols of s.  A plain loop is used on purpose: map() is lazy on
    # Python 3 and would leave the table empty.
    first_occurence = {}
    for i, c in enumerate(sorted(s)):
        first_occurence.setdefault(c, i)

    def pattern_positions(pattern):
        """Return the text positions at which pattern occurs exactly."""
        top = 0
        bottom = len(s) - 1
        # Consume the pattern from its last symbol to its first, narrowing
        # the [top, bottom] row range at every step.
        for symbol in reversed(pattern):
            before = count_symbol[top].get(symbol, 0)
            through = count_symbol[bottom + 1].get(symbol, 0)
            if through <= before:
                # symbol does not occur in s[top:bottom + 1]: no match.
                return []
            top = first_occurence[symbol] + before
            bottom = first_occurence[symbol] + through - 1
        return [a[i] for i in range(top, bottom + 1)]

    pos = []
    for pattern in patterns:
        pos += pattern_positions(pattern)
    pos.sort()
    return pos
def multiple_approximate_pattern_matching(text, patterns, d):
    """
    CODE CHALLENGE: Solve the Multiple Approximate Pattern Matching Problem.

    Each pattern is cut into at least d + 1 disjoint seeds, so any
    occurrence with at most d mismatches contains at least one seed
    verbatim.  Seeds are located exactly by a backward search over the
    Burrows-Wheeler transform of Text, and every candidate alignment is
    then verified symbol by symbol.

    Relies on the file-level helpers bwt() and suffix_array(); they are
    assumed to return, for a '$'-terminated text, its Burrows-Wheeler
    transform (a string) and the list of suffix start positions in sorted
    suffix order.

    Input: A string Text, followed by a collection of strings Patterns, and
        an integer d.  A '$' sentinel is appended to Text when it is
        missing; the trailing '$' is never part of a reported match.
    Output: All positions where one of the strings in Patterns appears as a
        substring of Text with at most d mismatches, sorted in increasing
        order. A position is listed once for every pattern that matches
        there.
    Raises: AssertionError if a pattern is not longer than d.
    """
    # endswith() also copes with an empty text, where text[-1] would raise
    # IndexError.
    if not text.endswith('$'):
        text += '$'
    # Length of the text proper.  Candidate windows are bounded by this
    # value so that a window can never overlap the '$' sentinel and be
    # accepted with the sentinel counted as a single mismatch.
    text_length = len(text) - 1
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] is the number of occurrences of c in s[:i].
    running_counts = {}
    count_symbol = [{}]
    for c in s:
        running_counts[c] = running_counts.get(c, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurence[c] is the index of the first c among the sorted
    # symbols of s.  A plain loop is used on purpose: map() is lazy on
    # Python 3 and would leave the table empty.
    first_occurence = {}
    for i, c in enumerate(sorted(s)):
        first_occurence.setdefault(c, i)

    def pattern_to_seeds(p):
        """Split p into (seed, offset of the seed inside p) pairs."""
        length = len(p)
        if length <= d:
            # Raised explicitly (rather than via `assert`) so the check
            # survives `python -O`; the exception type is unchanged.
            raise AssertionError('pattern length must be greater than d')
        # Floor division keeps the seed size an integer on Python 3 too.
        minsize = length // (d + 1)
        cut = list(range(0, length - minsize + 1, minsize))
        cut.append(length)
        return [(p[start:stop], start) for start, stop in zip(cut, cut[1:])]

    def seed_positions(seed):
        """Return the text positions at which seed occurs exactly."""
        top = 0
        bottom = len(s) - 1
        # Consume the seed from its last symbol to its first, narrowing
        # the [top, bottom] row range at every step.
        for symbol in reversed(seed):
            before = count_symbol[top].get(symbol, 0)
            through = count_symbol[bottom + 1].get(symbol, 0)
            if through <= before:
                # symbol does not occur in s[top:bottom + 1]: no match.
                return []
            top = first_occurence[symbol] + before
            bottom = first_occurence[symbol] + through - 1
        return [a[i] for i in range(top, bottom + 1)]

    def is_approximately_matching(offset, p):
        """Tell whether p aligned at offset has at most d mismatches."""
        mismatches = 0
        for i, c in enumerate(p):
            if c != text[offset + i]:
                mismatches += 1
                if mismatches > d:
                    return False
        return True

    def approximate_pattern_positions(p):
        """Return the distinct positions where p matches approximately."""
        found = set()
        last_start = text_length - len(p)
        for seed, offset in pattern_to_seeds(p):
            for candidate_position in seed_positions(seed):
                pattern_position = candidate_position - offset
                if pattern_position < 0:
                    # candidate would start before the text starts
                    continue
                if pattern_position > last_start:
                    # candidate would run past the end of the text
                    continue
                if pattern_position in found:
                    # already verified through another seed
                    continue
                if is_approximately_matching(pattern_position, p):
                    found.add(pattern_position)
        return list(found)

    pos = []
    for pattern in patterns:
        pos += approximate_pattern_positions(pattern)
    pos.sort()
    return pos
def multiple_approximate_pattern_matching(text, patterns, d):
    """
    CODE CHALLENGE: Solve the Multiple Approximate Pattern Matching Problem.

    Each pattern is cut into at least d + 1 disjoint seeds, so any
    occurrence with at most d mismatches contains at least one seed
    verbatim.  Seeds are located exactly by a backward search over the
    Burrows-Wheeler transform of Text, and every candidate alignment is
    then verified symbol by symbol.

    Relies on the file-level helpers bwt() and suffix_array(); they are
    assumed to return, for a '$'-terminated text, its Burrows-Wheeler
    transform (a string) and the list of suffix start positions in sorted
    suffix order.

    Input: A string Text, followed by a collection of strings Patterns, and
        an integer d.  A '$' sentinel is appended to Text when it is
        missing; the trailing '$' is never part of a reported match.
    Output: All positions where one of the strings in Patterns appears as a
        substring of Text with at most d mismatches, sorted in increasing
        order. A position is listed once for every pattern that matches
        there.
    Raises: AssertionError if a pattern is not longer than d.
    """
    # endswith() also copes with an empty text, where text[-1] would raise
    # IndexError.
    if not text.endswith('$'):
        text += '$'
    # Length of the text proper.  Candidate windows are bounded by this
    # value so that a window can never overlap the '$' sentinel and be
    # accepted with the sentinel counted as a single mismatch.
    text_length = len(text) - 1
    s = bwt(text)
    a = suffix_array(text)

    # count_symbol[i][c] is the number of occurrences of c in s[:i].
    running_counts = {}
    count_symbol = [{}]
    for c in s:
        running_counts[c] = running_counts.get(c, 0) + 1
        count_symbol.append(running_counts.copy())

    # first_occurence[c] is the index of the first c among the sorted
    # symbols of s.  A plain loop is used on purpose: map() is lazy on
    # Python 3 and would leave the table empty.
    first_occurence = {}
    for i, c in enumerate(sorted(s)):
        first_occurence.setdefault(c, i)

    def pattern_to_seeds(p):
        """Split p into (seed, offset of the seed inside p) pairs."""
        length = len(p)
        if length <= d:
            # Raised explicitly (rather than via `assert`) so the check
            # survives `python -O`; the exception type is unchanged.
            raise AssertionError('pattern length must be greater than d')
        # Floor division keeps the seed size an integer on Python 3 too.
        minsize = length // (d + 1)
        cut = list(range(0, length - minsize + 1, minsize))
        cut.append(length)
        return [(p[start:stop], start) for start, stop in zip(cut, cut[1:])]

    def seed_positions(seed):
        """Return the text positions at which seed occurs exactly."""
        top = 0
        bottom = len(s) - 1
        # Consume the seed from its last symbol to its first, narrowing
        # the [top, bottom] row range at every step.
        for symbol in reversed(seed):
            before = count_symbol[top].get(symbol, 0)
            through = count_symbol[bottom + 1].get(symbol, 0)
            if through <= before:
                # symbol does not occur in s[top:bottom + 1]: no match.
                return []
            top = first_occurence[symbol] + before
            bottom = first_occurence[symbol] + through - 1
        return [a[i] for i in range(top, bottom + 1)]

    def is_approximately_matching(offset, p):
        """Tell whether p aligned at offset has at most d mismatches."""
        mismatches = 0
        for i, c in enumerate(p):
            if c != text[offset + i]:
                mismatches += 1
                if mismatches > d:
                    return False
        return True

    def approximate_pattern_positions(p):
        """Return the distinct positions where p matches approximately."""
        found = set()
        last_start = text_length - len(p)
        for seed, offset in pattern_to_seeds(p):
            for candidate_position in seed_positions(seed):
                pattern_position = candidate_position - offset
                if pattern_position < 0:
                    # candidate would start before the text starts
                    continue
                if pattern_position > last_start:
                    # candidate would run past the end of the text
                    continue
                if pattern_position in found:
                    # already verified through another seed
                    continue
                if is_approximately_matching(pattern_position, p):
                    found.add(pattern_position)
        return list(found)

    pos = []
    for pattern in patterns:
        pos += approximate_pattern_positions(pattern)
    pos.sort()
    return pos