def multipleApproximatePatternMatching(text, patterns, d):
    """Return start positions where each pattern matches text with <= d mismatches.

    Seed-and-extend strategy: each pattern is cut into d + 1 consecutive
    seeds of length floor(n / (d + 1)) (the last seed also takes the
    remainder).  Every seed is located exactly through
    ``mpm.PatternMatching``; each exact seed hit is then extended to a
    full-length window of ``text`` that is verified with
    ``hamming_distance``.

    Args:
        text: The string to search.  It is handed unchanged to the index
            builders (``BWT.BWT``, ``mpm.suffixArray``, ...); any
            terminator they may require is the caller's responsibility.
        patterns: Iterable of pattern strings.
        d: Maximum number of mismatches allowed; expected to be a
            non-negative integer.

    Returns:
        A flat list of start positions, pattern after pattern.  Within one
        pattern every position is reported once, in ascending order.
    """
    # Index structures are built once and shared by all patterns.
    # K and C are module-level settings defined outside this function.
    lastC = BWT.BWT(text)
    firstOccur = BWTMatch.firstOccurrence(lastC)
    partialsuffixarray = partialSuffixArray.partialSuffixArray(text, K)
    checkpointarray = mpm.checkPointArray(lastC, C)
    suffixarray, _ = mpm.suffixArray(text)

    text_len = len(text)
    positions = []
    for pattern in patterns:
        # Seed preparation: d + 1 seeds of length floor(n / (d + 1)).
        n = len(pattern)
        # Floor division keeps k an int on Python 3 as well; with true
        # division the slice bounds below would be floats (TypeError).
        k = n // (d + 1)

        # Different seeds can lead to the same window, so collect the
        # verified starts in a set to report each one only once.
        found = set()
        for i in range(d + 1):
            seed_start = i * k
            if i == d:
                # The last seed absorbs the remainder of the pattern.
                seed = pattern[seed_start:]
            else:
                seed = pattern[seed_start:seed_start + k]

            # Seed detection: exact occurrences of this seed in text.
            seed_hits = mpm.PatternMatching(
                suffixarray, firstOccur, lastC, checkpointarray,
                partialsuffixarray, seed)
            if seed_hits is None:
                continue

            for pos in seed_hits:
                # Seed extension: shift back to where the whole pattern
                # would start if this seed hit belongs to an occurrence.
                posstart = pos - seed_start
                # A window that starts before the text or runs past its
                # end cannot be a full-length occurrence; a negative
                # start would also make the slice wrap around.
                if posstart < 0 or posstart + n > text_len:
                    continue
                dist = hamming_distance(text[posstart:posstart + n], pattern)
                if dist is not None and dist <= d:
                    found.add(posstart)
        positions.extend(sorted(found))
    return positions
def multiplePatternMatching(text, patterns):
    """Gather the positions reported by PatternMatching for every pattern.

    The lookup structures (last column, first-occurrence table, partial
    suffix array, checkpoint array, suffix array) are derived from ``text``
    once and reused for each pattern.  K and C are module-level settings
    defined outside this function.

    Returns a flat list holding, pattern after pattern, every position
    that ``PatternMatching`` yields; patterns for which it returns None
    contribute nothing.
    """
    last_column = BWT.BWT(text)
    first_occurrence = BWTMatch.firstOccurrence(last_column)
    partial_sa = partialSuffixArray.partialSuffixArray(text, K)
    checkpoints = checkPointArray(last_column, C)
    suffix_array, _suffixes = suffixArray(text)

    positions = []
    for pattern in patterns:
        hits = PatternMatching(
            suffix_array, first_occurrence, last_column,
            checkpoints, partial_sa, pattern)
        if hits is None:
            # Nothing reported for this pattern.
            continue
        positions.extend(hits)
    return positions
def multiplePatternMatching(text, patterns):
    """Run PatternMatching for each pattern and concatenate the results.

    All index data is computed from ``text`` up front: the BWT last
    column, its first-occurrence table, a partial suffix array (built
    with the module-level K), a checkpoint array (built with the
    module-level C) and the suffix array.  Each pattern is then looked up
    against those structures.

    Returns one list with the positions of all patterns, in the order the
    patterns were given; a None result for a pattern is skipped.
    """
    bwt_last = BWT.BWT(text)
    first_occ = BWTMatch.firstOccurrence(bwt_last)
    psa = partialSuffixArray.partialSuffixArray(text, K)
    cp_array = checkPointArray(bwt_last, C)
    sa, _ = suffixArray(text)

    all_positions = []
    for pat in patterns:
        found = PatternMatching(sa, first_occ, bwt_last, cp_array, psa, pat)
        if found is not None:
            all_positions.extend(found)
    return all_positions
def multipleApproximatePatternMatching(text, patterns, d):
    """Find approximate occurrences (at most d mismatches) of patterns in text.

    Works by seed-and-extend.  A pattern of length n is divided into
    d + 1 seeds of length floor(n / (d + 1)), the final seed taking any
    leftover characters.  Each seed is searched exactly with
    ``mpm.PatternMatching``; for every exact hit the corresponding
    full-length window of ``text`` is compared to the pattern with
    ``hamming_distance`` and kept when the distance is at most d.

    Args:
        text: The string to search.  Passed as-is to ``BWT.BWT``,
            ``partialSuffixArray.partialSuffixArray`` and
            ``mpm.suffixArray``; this function adds no terminator.
        patterns: Iterable of pattern strings.
        d: Allowed number of mismatches; expected to be a non-negative
            integer.

    Returns:
        A flat list of start positions, grouped by pattern in input
        order.  Positions for one pattern are unique and ascending.
    """
    # Build the search structures once; K and C are module-level settings
    # defined outside this function.
    lastC = BWT.BWT(text)
    firstOccur = BWTMatch.firstOccurrence(lastC)
    partialsuffixarray = partialSuffixArray.partialSuffixArray(text, K)
    checkpointarray = mpm.checkPointArray(lastC, C)
    suffixarray, _ = mpm.suffixArray(text)

    text_len = len(text)
    positions = []
    for pattern in patterns:
        n = len(pattern)
        # Seed length.  "//" is required: "/" yields a float on Python 3
        # and float slice indices raise TypeError.
        k = n // (d + 1)

        # Several seeds may confirm the same window; a set keeps one
        # record per start position.
        posstarts = set()
        for i in range(d + 1):
            offset = i * k
            # Every seed has length k except the last, which runs to the
            # end of the pattern.
            pat = pattern[offset:] if i == d else pattern[offset:offset + k]

            # Seed detection: exact matches of this seed.
            position = mpm.PatternMatching(
                suffixarray, firstOccur, lastC, checkpointarray,
                partialsuffixarray, pat)
            if position is None:
                continue

            for pos in position:
                # Seed extension: candidate start of the whole pattern.
                posstart = pos - offset
                # Skip windows that do not lie fully inside text (a
                # negative start would silently wrap when slicing).
                if posstart < 0 or posstart + n > text_len:
                    continue
                dist = hamming_distance(text[posstart:posstart + n], pattern)
                if dist is not None and dist <= d:
                    posstarts.add(posstart)
        positions.extend(sorted(posstarts))
    return positions