-
Notifications
You must be signed in to change notification settings - Fork 9
/
Code6_15_MultipleApproximatePatternMatching.py
70 lines (59 loc) · 2.11 KB
/
Code6_15_MultipleApproximatePatternMatching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
__author__ = "Chunyu Zhao"
__date__ = "20160205"
import sys, BWT, BetterBWTMatching as BWTMatch,MultiplePatternMatching as mpm,partialSuffixArray
# C is handed to mpm.checkPointArray(lastC, C) when the index is built.
# NOTE(review): presumably the spacing between stored checkpoint rows - confirm
# against MultiplePatternMatching.checkPointArray.
C = 5
# K is handed to partialSuffixArray.partialSuffixArray(text, K) when the index is built.
# NOTE(review): presumably the sampling interval of the partial suffix array -
# confirm against the partialSuffixArray module.
K = 5
''' patterns are not always the same length'''
def hamming_distance(str1, str2):
    """Return the number of mismatches between two strings.

    The strings do not have to be the same length: characters are compared
    pairwise over the length of the shorter string, and every leftover
    character of the longer string counts as one additional mismatch.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        Non-negative int: differing positions plus the length difference.
    """
    # zip() stops at the shorter string, which is exactly the overlap.
    mismatches = sum(1 for a, b in zip(str1, str2) if a != b)
    # The unmatched tail of the longer string is all mismatches.
    return mismatches + abs(len(str1) - len(str2))
def multipleApproximatePatternMatching(text, patterns, d):
    """Find where each pattern occurs in text with at most d mismatches.

    Uses seed-and-extend: each pattern is cut into d + 1 consecutive seeds,
    so an occurrence with at most d mismatches must contain at least one seed
    that matches exactly. Every exact seed hit is extended to a full-length
    candidate window, which is then verified with hamming_distance.

    Args:
        text: the string to search (the script driver appends "$" to it
            before calling).
        patterns: iterable of pattern strings; they may differ in length.
        d: maximum number of mismatches allowed per occurrence.

    Returns:
        A list of start positions in text. Results are grouped in the order
        the patterns are given; within one pattern each position appears
        once, in ascending order.
    """
    # Build the index structures once; they are shared by every pattern.
    lastC = BWT.BWT(text)
    firstOccur = BWTMatch.firstOccurrence(lastC)
    partialsuffixarray = partialSuffixArray.partialSuffixArray(text, K)
    checkpointarray = mpm.checkPointArray(lastC, C)
    # mpm.suffixArray returns a pair; only the first element is used here.
    suffixarray = mpm.suffixArray(text)[0]

    positions = []
    for pattern in patterns:
        # Seed preparation: d + 1 seeds of length floor(n / (d + 1)); the
        # last seed also takes whatever remainder is left over.
        n = len(pattern)
        # Floor division keeps k an int on both Python 2 and Python 3;
        # a float k would break the slicing below.
        k = n // (d + 1)
        # NOTE(review): when len(pattern) <= d, k is 0 and the first d seeds
        # are empty strings; the outcome then depends on how
        # mpm.PatternMatching treats an empty pattern.

        # Different seeds can point at the same start, so collect in a set
        # to keep a single record per position for this pattern.
        starts = set()
        for i in range(d + 1):
            offset = i * k
            if i == d:
                seed = pattern[offset:]
            else:
                seed = pattern[offset:offset + k]

            # Seed detection: exact matching of the seed against the text.
            hits = mpm.PatternMatching(suffixarray, firstOccur, lastC,
                                       checkpointarray, partialsuffixarray,
                                       seed)
            if hits is None:
                continue

            for pos in hits:
                # Seed extension: shift back by the seed's offset to get
                # where the whole pattern would have to start.
                posstart = pos - offset
                if posstart < 0:
                    # The pattern would begin before the text does. A
                    # negative slice start would index from the end of
                    # text instead, so this is never a valid occurrence.
                    continue
                window = text[posstart:posstart + n]
                if hamming_distance(window, pattern) <= d:
                    starts.add(posstart)

        # Sorted so the output does not depend on set iteration order.
        positions.extend(sorted(starts))
    return positions
if __name__ == '__main__':
    # Script driver. With one command-line argument, read the input file:
    #   line 1: text, line 2: whitespace-separated patterns, line 3: d.
    # With no argument, fall back to a small built-in example.
    if len(sys.argv) == 2:
        filename = sys.argv[1]
        with open(filename) as f:
            lines = f.read().splitlines()
        # "$" is appended as the end-of-text sentinel before indexing.
        text = lines[0] + "$"
        # split() with no argument ignores repeated or trailing blanks, so
        # no empty-string pattern is ever produced.
        patterns = lines[1].split()
        d = int(lines[2])
    else:
        text = "ACATGCTACTTT$"
        patterns = ["ATT", "GCC", "GCTA", "TATT"]
        d = 1
    positions = multipleApproximatePatternMatching(text, patterns, d)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(' '.join(map(str, positions)))