def multipleApproximatePatternMatching(text, patterns, d):
    """Find where each pattern occurs in ``text`` with at most ``d`` mismatches.

    Uses a seed-and-extend strategy:

    1. Seed preparation: each pattern of length ``n`` is cut into ``d + 1``
       consecutive seeds of length ``n // (d + 1)``; the last seed absorbs
       the remainder.  With at most ``d`` mismatches spread over ``d + 1``
       seeds, at least one seed must occur exactly.
    2. Seed detection: every seed is looked up exactly with
       ``mpm.PatternMatching``.
    3. Seed extension: each seed hit is mapped back to the start of the
       whole pattern, and the full window is verified with
       ``hamming_distance``.

    Args:
        text: The string to search in.
        patterns: Iterable of pattern strings.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        A flat list of start positions, grouped by pattern in input order.
        Within one pattern every position is reported once, in ascending
        order.

    Note:
        Relies on the names ``K`` and ``C`` (passed to
        ``partialSuffixArray.partialSuffixArray`` and ``mpm.checkPointArray``)
        and on ``BWT``, ``BWTMatch``, ``partialSuffixArray``, ``mpm`` and
        ``hamming_distance`` being defined elsewhere in the module.
    """
    # Presumably the Burrows-Wheeler last column of text -- confirm in BWT.
    lastC = BWT.BWT(text)
    firstOccur = BWTMatch.firstOccurrence(lastC)

    partialsuffixarray = partialSuffixArray.partialSuffixArray(text, K)
    checkpointarray = mpm.checkPointArray(lastC, C)
    # Only the first element of the returned pair is needed here.
    suffixarray, _suffixes = mpm.suffixArray(text)

    text_len = len(text)
    positions = []
    for pattern in patterns:
        # Seed preparation.
        n = len(pattern)
        # Floor division: the seed length is used as a slice index, so it
        # must stay an int on Python 3 as well as Python 2.
        k = n // (d + 1)

        # The same start can be reached through several seeds; a set keeps
        # one record per start position for this pattern.
        posstarts = set()
        for i in range(d + 1):
            if i == d:
                pat = pattern[i * k:]
            else:
                pat = pattern[i * k:(i + 1) * k]

            # Seed detection: exact matching of this seed against text.
            position = mpm.PatternMatching(suffixarray, firstOccur, lastC,
                                           checkpointarray,
                                           partialsuffixarray, pat)
            if position is None:
                continue

            for pos in position:
                # Seed extension: shift back by the seed's offset inside the
                # pattern to get the candidate start of the whole pattern.
                posstart = pos - k * i
                # A window that starts before the text or runs past its end
                # cannot be an occurrence; a negative start would also make
                # the slice below wrap around to the end of text.
                if posstart < 0 or posstart + n > text_len:
                    continue
                dist = hamming_distance(text[posstart:posstart + n], pattern)
                if dist is not None and dist <= d:
                    posstarts.add(posstart)
        # Sorted so the output does not depend on set iteration order.
        positions.extend(sorted(posstarts))
    return positions
def multiplePatternMatching(text, patterns):
    """Gather every position that ``PatternMatching`` reports for the patterns.

    The lookup structures are built from ``text`` once (last column, first
    occurrence table, partial suffix array, checkpoint array, suffix array)
    and reused for every pattern.

    Args:
        text: The string to search in.
        patterns: Iterable of patterns, each handed to ``PatternMatching``.

    Returns:
        A flat list of positions in pattern order.  Positions are not
        de-duplicated; patterns for which ``PatternMatching`` returns
        ``None`` contribute nothing.

    Note:
        ``K``, ``C``, ``BWT``, ``BWTMatch``, ``partialSuffixArray``,
        ``checkPointArray``, ``suffixArray`` and ``PatternMatching`` are
        expected to be defined elsewhere in the module.
    """
    last_column = BWT.BWT(text)
    first_occurrence = BWTMatch.firstOccurrence(last_column)

    partial_sa = partialSuffixArray.partialSuffixArray(text, K)
    checkpoints = checkPointArray(last_column, C)
    # The second element of the returned pair is not used by this function.
    suffix_array, _ = suffixArray(text)

    hits = []
    for pattern in patterns:
        found = PatternMatching(suffix_array, first_occurrence, last_column,
                                checkpoints, partial_sa, pattern)
        if found is None:
            continue
        hits.extend(found)
    return hits
Example #3
0
def multiplePatternMatching(text, patterns):
    """Return all positions found by ``PatternMatching`` across ``patterns``.

    Index structures derived from ``text`` are computed once up front and
    shared by every per-pattern lookup.

    Args:
        text: The string to search in.
        patterns: Iterable of patterns to look up.

    Returns:
        A single flat list with the positions of each pattern, in the order
        the patterns were given.  A ``None`` result from ``PatternMatching``
        is skipped.

    Note:
        Depends on ``K``, ``C``, ``BWT``, ``BWTMatch``,
        ``partialSuffixArray``, ``checkPointArray``, ``suffixArray`` and
        ``PatternMatching`` from the surrounding module.
    """
    bwt_last = BWT.BWT(text)
    first_occ = BWTMatch.firstOccurrence(bwt_last)

    partial_sa = partialSuffixArray.partialSuffixArray(text, K)
    checkpoints = checkPointArray(bwt_last, C)
    # Only the first element of the returned pair is consumed below.
    full_sa, _ = suffixArray(text)

    # Lazy generator: each pattern is matched right before its hits are
    # consumed, so lookups and result collection stay interleaved.
    per_pattern_hits = (
        PatternMatching(full_sa, first_occ, bwt_last, checkpoints,
                        partial_sa, pattern)
        for pattern in patterns
    )
    return [
        pos
        for hits in per_pattern_hits
        if hits is not None
        for pos in hits
    ]
def multipleApproximatePatternMatching(text, patterns, d):
    """Locate approximate occurrences (at most ``d`` mismatches) of patterns.

    Seed-and-extend approach: a pattern of length ``n`` is split into
    ``d + 1`` consecutive seeds of length ``n // (d + 1)`` (the final seed
    takes any leftover characters).  An occurrence with at most ``d``
    mismatches must contain at least one of those seeds exactly, so each
    seed is searched exactly via ``mpm.PatternMatching``.  Every seed hit is
    then extended to the full pattern window and checked with
    ``hamming_distance``.

    Args:
        text: The string to search in.
        patterns: Iterable of pattern strings.
        d: Maximum number of mismatches tolerated per occurrence.

    Returns:
        A flat list of start positions.  Results are grouped by pattern in
        input order; within a pattern each start appears once, ascending.

    Note:
        ``K`` and ``C`` (arguments to ``partialSuffixArray.partialSuffixArray``
        and ``mpm.checkPointArray``) as well as ``BWT``, ``BWTMatch``,
        ``partialSuffixArray``, ``mpm`` and ``hamming_distance`` must be
        provided by the surrounding module.
    """
    # Presumably the Burrows-Wheeler last column of text -- confirm in BWT.
    lastC = BWT.BWT(text)
    firstOccur = BWTMatch.firstOccurrence(lastC)

    partialsuffixarray = partialSuffixArray.partialSuffixArray(text, K)
    checkpointarray = mpm.checkPointArray(lastC, C)
    # Only the first element of the returned pair is needed here.
    suffixarray, _suffixes = mpm.suffixArray(text)

    text_len = len(text)
    positions = []
    for pattern in patterns:
        # Seed preparation: d + 1 pieces of length floor(n / (d + 1)).
        n = len(pattern)
        # "//" keeps the seed length an int on Python 3; with "/" it becomes
        # a float there and the slices below raise TypeError.
        k = n // (d + 1)

        # One record per start position, even if several seeds lead to it.
        posstarts = set()
        for i in range(d + 1):
            # The last seed runs to the end of the pattern so that no
            # trailing characters are dropped when n % (d + 1) != 0.
            seed_end = n if i == d else (i + 1) * k
            pat = pattern[i * k:seed_end]

            # Seed detection: exact match of this seed in text.
            position = mpm.PatternMatching(suffixarray, firstOccur, lastC,
                                           checkpointarray,
                                           partialsuffixarray, pat)
            if position is None:
                continue

            for pos in position:
                # Seed extension: translate the seed hit into the start of
                # the whole pattern and verify the full window.
                posstart = pos - k * i
                # Skip windows that do not lie fully inside text; a negative
                # start would otherwise make the slice wrap around.
                if posstart < 0 or posstart + n > text_len:
                    continue
                dist = hamming_distance(text[posstart:posstart + n], pattern)
                if dist is not None and dist <= d:
                    posstarts.add(posstart)
        # Sorted for a deterministic order independent of set internals.
        positions.extend(sorted(posstarts))
    return positions