Example #1
0
	def calc_word_distance(self, str1, str2):
		""" Scores how different two strings are using Damerau-Levenshtein distance.

		Both inputs are lower-cased, every character outside a-z / 0-9 is turned
		into a space, and runs of spaces are collapsed to one before comparing.

		Returns the distance between the two normalized strings, plus (only when
		the normalized str2 is longer than the normalized str1) the distance
		between str1 and the equally long prefix of str2.
		"""
		def normalize(text):
			# non alpha-numerics become spaces, then repeated spaces are squeezed
			spaced = re.sub('[^a-z0-9]', ' ', text.lower())
			return re.sub(' {2,}', ' ', spaced)

		left = normalize(str1)
		right = normalize(str2)

		# prefix comparison only applies when str2 has more text than str1
		prefix_part = (
			StringUtil.damerau_levenshtein(left, right[:len(left)])
			if len(right) > len(left)
			else 0
		)

		# adding the full-string comparison "weights" the score by how much of
		# str2 is left unmatched: str1="abc" against str2="abc but there is more"
		# should score a higher distance than str1="abc but" against the same str2
		total = prefix_part + StringUtil.damerau_levenshtein(left, right)

		# TODO might want an additional check on the exact original strings, since
		# the normalization above makes "ab-cd" and "ab.cd" compare as identical,
		# which would be bad
		return total