def GSTPrechelt(s1, s2, minimalMatchingLength):
    """Collect common token runs of two texts via hashed fixed-size windows.

    Both inputs are split on whitespace into tokens. Each window of
    ``minimalMatchingLength`` consecutive tokens of ``s2`` is hashed with
    ``createKRHashValue`` and looked up in the table that ``computeHashList``
    builds from the tokens of ``s1``. Every candidate position returned by
    the table is verified token by token (so hash collisions cannot produce
    false matches) and extended for as long as the tokens keep agreeing.

    Args:
        s1: First text; tokenized with ``str.split()``.
        s2: Second text; tokenized with ``str.split()``.
        minimalMatchingLength: Window size in tokens. Common runs shorter
            than this are discarded.

    Returns:
        The result of ``reduceMatches`` applied to the list of raw matches,
        each a ``(pos1, pos2, length)`` tuple where ``pos1`` / ``pos2`` are
        token offsets into ``s1`` / ``s2`` and ``length`` is the number of
        agreeing tokens. The order of the raw list is unspecified because
        the matches are gathered in a set.
    """
    s1List = s1.split()
    s2List = s2.split()
    s1Len = len(s1List)
    s2Len = len(s2List)
    matches = set()
    hashList = computeHashList(s1List, minimalMatchingLength)

    # The "+ 1" makes the scan include the window that ends on the last token
    # of s2. The earlier bound (len - minimalMatchingLength) stopped one
    # window short, so a common run starting at that final window was missed.
    for i in range(s2Len - minimalMatchingLength + 1):
        h = createKRHashValue(" ".join(s2List[i:i + minimalMatchingLength]))
        # Start positions in s1 whose window produced the same hash value.
        positions = hashList.get(h)
        if not positions:
            continue
        for pos in positions:
            # Verify the candidate and extend it as far as the tokens agree.
            j = 0
            while (pos + j < s1Len and i + j < s2Len
                   and s1List[pos + j] == s2List[i + j]):
                j += 1
            if j >= minimalMatchingLength:
                # In-place add avoids rebuilding the whole set per match.
                matches.add((pos, i, j))

    return reduceMatches(list(matches))
def computeHashList(s, minimalMatchingLength):
    """Index every fixed-size window of ``s`` by its hash value.

    Args:
        s: Sequence of string tokens (the windows are joined with a single
            space before hashing).
        minimalMatchingLength: Number of consecutive tokens per window.

    Returns:
        A ``GSTHashtable`` to which, for each window start index ``i``, the
        pair ``(createKRHashValue(window_text), i)`` has been added. If ``s``
        has fewer than ``minimalMatchingLength`` tokens the table is empty.
    """
    hashList = GSTHashtable()
    # The "+ 1" includes the window that ends on the last token of s; without
    # it the final window was never indexed. ``range`` is used instead of
    # ``xrange`` so the function runs on both Python 2 and Python 3.
    for i in range(len(s) - minimalMatchingLength + 1):
        window = " ".join(s[i:i + minimalMatchingLength])
        hashList.add(createKRHashValue(window), i)
    return hashList