def test_edit_distance(self):
    # Table of (first, second, expected distance). Every pair of strings
    # appears twice, once in each argument order, with the same expectation.
    expectations = (
        ('', 'aa', 2),
        ('aa', '', 2),
        ('a', 'ab', 1),
        ('ab', 'a', 1),
        ('ab', 'aa', 1),
        ('aa', 'ab', 1),
        ('abd', 'abcdef', 3),
        ('abcdef', 'abd', 3),
    )
    # Rows are checked in order; the first mismatch fails the test.
    for first, second, expected in expectations:
        self.assertEqual(edit_distance(first, second), expected)
def contributors_by_fuzzy_match(self, string):
    """Find the contributor(s) that best match a free-form string.

    Lookup strategies are tried from strictest to loosest; the first one
    that produces an unambiguous result is returned:

    1. Exact match for full name, email or IRC nickname.
    2. Exact match for the email username (the part before "@").
    3. Exact match for a name shorthand such as "Dan B" or "Tim H".
    4. Fuzzy match against the tokens of each contributor name, ranked
       by edit distance.

    Args:
        string: The text to look up. Steps 1, 2 and 4 use its lowercased
            form; step 3 compares the text exactly as given.

    Returns:
        A ``(contributors, distance)`` tuple. ``distance`` is 0 for
        steps 1-3 and the smallest edit distance found for step 4.
        When nothing matches, the result is ``([], n)`` where ``n`` is
        the length of the lowercased string.
    """
    string_in_lowercase = string.lower()

    # 1. Exact match for fullname, email and irc_nicknames
    account = (
        self.contributor_by_name(string_in_lowercase)
        or self.account_by_email(string_in_lowercase)
        or self.contributor_by_irc_nickname(string_in_lowercase)
    )
    if account:
        return [account], 0

    # 2. Exact match for email username (before @)
    # The truthiness guard is kept because the helper's return value on
    # "no match" is not visible from here.
    accounts = self.contributors_by_email_username(string_in_lowercase)
    if accounts and len(accounts) == 1:
        return accounts, 0

    # 3. Exact match for first name, last name, and first name + initial
    # combinations such as "Dan B" and "Tim H"
    # Uses the original (not lowercased) string.
    accounts = [
        contributor
        for contributor in self.contributors()
        if string in self._contributor_name_shorthands(contributor)
    ]
    if len(accounts) == 1:
        return accounts, 0

    # 4. Finally, fuzzy-match using edit-distance
    target_length = len(string_in_lowercase)
    best_matches = []
    # Floor division keeps the threshold, and therefore the returned
    # distance, an integer under both Python 2 and Python 3.
    min_distance = target_length // 2 - 1
    for contributor in self.contributors():
        # Only score tokens whose length is within the current threshold
        # of the target length.
        distances = [
            edit_distance(token, string_in_lowercase)
            for token in self._tokenize_contributor_name(contributor)
            if abs(len(token) - target_length) <= min_distance
        ]
        if not distances:
            continue
        distance = min(distances)
        if distance == min_distance:
            # Ties with the current best are accumulated.
            best_matches.append(contributor)
        elif distance < min_distance:
            # A strictly better match discards everything found so far.
            best_matches = [contributor]
            min_distance = distance

    if not best_matches:
        return [], target_length
    return best_matches, min_distance
def contributors_by_fuzzy_match(self, string):
    """Find the contributor(s) that best match a free-form string.

    Lookup strategies are tried from strictest to loosest; the first one
    that produces an unambiguous result is returned:

    1. Exact match for full name, email or IRC nickname.
    2. Exact match for the email username (the part before "@").
    3. Exact match for a name shorthand such as "Dan B" or "Tim H".
    4. Fuzzy match against the tokens of each contributor name, ranked
       by edit distance.

    Args:
        string: The text to look up. Steps 1, 2 and 4 use its lowercased
            form; step 3 compares the text exactly as given.

    Returns:
        A ``(contributors, distance)`` tuple. ``distance`` is 0 for
        steps 1-3 and the smallest edit distance found for step 4.
        When nothing matches, the result is ``([], n)`` where ``n`` is
        the length of the lowercased string.
    """
    string_in_lowercase = string.lower()

    # 1. Exact match for fullname, email and irc_nicknames
    account = (
        self.contributor_by_name(string_in_lowercase)
        or self.account_by_email(string_in_lowercase)
        or self.contributor_by_irc_nickname(string_in_lowercase)
    )
    if account:
        return [account], 0

    # 2. Exact match for email username (before @)
    # The truthiness guard is kept because the helper's return value on
    # "no match" is not visible from here.
    accounts = self.contributors_by_email_username(string_in_lowercase)
    if accounts and len(accounts) == 1:
        return accounts, 0

    # 3. Exact match for first name, last name, and first name + initial
    # combinations such as "Dan B" and "Tim H"
    # Uses the original (not lowercased) string.
    accounts = [
        contributor
        for contributor in self.contributors()
        if string in self._contributor_name_shorthands(contributor)
    ]
    if len(accounts) == 1:
        return accounts, 0

    # 4. Finally, fuzzy-match using edit-distance
    target_length = len(string_in_lowercase)
    best_matches = []
    # Floor division keeps the threshold, and therefore the returned
    # distance, an integer under both Python 2 and Python 3.
    min_distance = target_length // 2 - 1
    for contributor in self.contributors():
        # Only score tokens whose length is within the current threshold
        # of the target length.
        distances = [
            edit_distance(token, string_in_lowercase)
            for token in self._tokenize_contributor_name(contributor)
            if abs(len(token) - target_length) <= min_distance
        ]
        if not distances:
            continue
        distance = min(distances)
        if distance == min_distance:
            # Ties with the current best are accumulated.
            best_matches.append(contributor)
        elif distance < min_distance:
            # A strictly better match discards everything found so far.
            best_matches = [contributor]
            min_distance = distance

    if not best_matches:
        return [], target_length
    return best_matches, min_distance