def fingerprint(self, phrase):
    """Return the q-gram fingerprint of a phrase.

    The phrase is lowercased, NFKD-decomposed, stripped of everything but
    alphanumerics, tokenized into q-grams, and the unique q-grams are
    sorted and joined.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the q-gram fingerprint

    Returns
    -------
    str
        The q-gram fingerprint of the phrase

    Examples
    --------
    >>> qf = QGram()
    >>> qf.fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qf.fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qf.fingerprint('Niall')
    'aliallni'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    normalized = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    alnum_only = ''.join(ch for ch in normalized if ch.isalnum())
    unique_grams = self._tokenizer.tokenize(alnum_only).get_set()
    return self._joiner.join(sorted(unique_grams))
def fingerprint(self, phrase):
    """Return the q-gram fingerprint of a phrase.

    The phrase is lowercased, NFKD-decomposed, reduced to its
    alphanumeric characters, tokenized into q-grams, and the unique
    q-grams are sorted and joined.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the q-gram fingerprint

    Returns
    -------
    str
        The q-gram fingerprint of the phrase

    Examples
    --------
    >>> qf = QGram()
    >>> qf.fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qf.fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qf.fingerprint('Niall')
    'aliallni'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    cleaned = unicode_normalize('NFKD', phrase.strip().lower())
    cleaned = ''.join(ch for ch in cleaned if ch.isalnum())
    unique_grams = self._tokenizer.tokenize(cleaned).get_set()
    return self._joiner.join(sorted(unique_grams))
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique
    q-grams in a normalized string, alphabetized & concatenated, as
    described at :cite:`OpenRefine:2012`.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on
        either end of the phrase, as defined in abydos.util.qgram()
    :param str joiner: the string that will be placed between each word
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    cleaned = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    cleaned = ''.join(ch for ch in cleaned if ch.isalnum())
    grams = QGrams(cleaned, qval, start_stop)
    return joiner.join(sorted(grams))
def soundex_br(word, max_length=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    SoundexBR is a Soundex variant for Brazilian Portuguese, based on
    :cite:`Marcelino:2015`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the SoundexBR code
    :rtype: str

    >>> soundex_br('Oliveira')
    'O416'
    >>> soundex_br('Almeida')
    'A453'
    >>> soundex_br('Barbosa')
    'B612'
    >>> soundex_br('Araújo')
    'A620'
    >>> soundex_br('Gonçalves')
    'G524'
    >>> soundex_br('Goncalves')
    'G524'
    """
    # Soundex-style digit classes, keyed by code point for str.translate.
    _soundex_br_translation = dict(
        zip((ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230120022455012623010202'))

    # Uppercase, decompose accents (NFKD), and keep only A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(
        c for c in word
        if c in {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
            'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z'
        })

    # Normalize the initial letter for Portuguese phonetics; the first
    # letter is kept verbatim in the code, so only it is adjusted here.
    if word[:2] == 'WA':
        first = 'V'
    elif word[:1] == 'K' and word[1:2] in {'A', 'O', 'U'}:
        first = 'C'
    elif word[:1] == 'C' and word[1:2] in {'I', 'E'}:
        first = 'S'
    elif word[:1] == 'G' and word[1:2] in {'E', 'I'}:
        first = 'J'
    elif word[:1] == 'Y':
        first = 'I'
    elif word[:1] == 'H':
        # Initial H is silent: drop it and start from the next letter.
        first = word[1:2]
        word = word[1:]
    else:
        first = word[:1]

    # Code the remainder, collapse runs, and drop the vowel class (0).
    sdx = first + word[1:].translate(_soundex_br_translation)
    sdx = _delete_consecutive_repeats(sdx)
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += ('0' * max_length)

    return sdx[:max_length]
def fingerprint(self, phrase):
    """Return string fingerprint.

    The phrase is lowercased and NFKD-normalized, punctuation is dropped,
    and the unique words are sorted and joined.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the fingerprint

    Returns
    -------
    str
        The fingerprint of the phrase

    Example
    -------
    >>> sf = String()
    >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    lowered = unicode_normalize('NFKD', phrase.strip().lower())
    kept = ''.join([ch for ch in lowered if ch.isalnum() or ch.isspace()])
    unique_words = sorted(set(kept.split()))
    return self._joiner.join(unique_words)
def encode(self, word): """Return the Roger Root code for a word. Parameters ---------- word : str The word to transform Returns ------- str The Roger Root code Examples -------- >>> pe = RogerRoot() >>> pe.encode('Christopher') '06401' >>> pe.encode('Niall') '02500' >>> pe.encode('Smith') '00310' >>> pe.encode('Schmidt') '06310' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', word.upper()) word = ''.join(c for c in word if c in self._uc_set) code = '' pos = 0 # Do first digit(s) first for num in range(4, 0, -1): if word[:num] in self._init_patterns[num]: code = self._init_patterns[num][word[:num]] pos += num break # Then code subsequent digits while pos < len(word): for num in range(4, 0, -1): # pragma: no branch if word[pos:pos + num] in self._med_patterns[num]: code += self._med_patterns[num][word[pos:pos + num]] pos += num break code = self._delete_consecutive_repeats(code) code = code.replace('*', '') if self._zero_pad: code += '0' * self._max_length return code[:self._max_length]
def fingerprint(self, phrase):
    """Return string fingerprint.

    The phrase is lowercased and NFKD-normalized, punctuation is dropped,
    and the unique words are sorted and joined.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the fingerprint

    Returns
    -------
    str
        The fingerprint of the phrase

    Example
    -------
    >>> sf = String()
    >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    phrase = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join([c for c in phrase if c.isalnum() or c.isspace()])
    # sorted() accepts any iterable, so wrapping the set in list() was
    # redundant (flake8-comprehensions C414); behavior is unchanged.
    phrase = self._joiner.join(sorted(set(phrase.split())))
    return phrase
def fingerprint(self, phrase, joiner=' '):
    """Return string fingerprint.

    The phrase is lowercased and NFKD-normalized, punctuation is dropped,
    and the unique words are sorted and joined with `joiner`.

    Parameters
    ----------
    phrase : str
        The string from which to calculate the fingerprint
    joiner : str
        The string that will be placed between each word

    Returns
    -------
    str
        The fingerprint of the phrase

    Example
    -------
    >>> sf = String()
    >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    normalized = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    filtered = ''.join([ch for ch in normalized if ch.isalnum() or ch.isspace()])
    unique_words = sorted(list(set(filtered.split())))
    return joiner.join(unique_words)
def encode(self, word, max_length=5, zero_pad=True): """Return the Roger Root code for a word. Parameters ---------- word : str The word to transform max_length : int The maximum length (default 5) of the code to return zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string Returns ------- str The Roger Root code Examples -------- >>> roger_root('Christopher') '06401' >>> roger_root('Niall') '02500' >>> roger_root('Smith') '00310' >>> roger_root('Schmidt') '06310' """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) code = '' pos = 0 # Do first digit(s) first for num in range(4, 0, -1): if word[:num] in self._init_patterns[num]: code = self._init_patterns[num][word[:num]] pos += num break # Then code subsequent digits while pos < len(word): for num in range(4, 0, -1): # pragma: no branch if word[pos:pos + num] in self._med_patterns[num]: code += self._med_patterns[num][word[pos:pos + num]] pos += num break code = self._delete_consecutive_repeats(code) code = code.replace('*', '') if zero_pad: code += '0' * max_length return code[:max_length]
def encode(self, word):
    """Return the SoundD code.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The SoundD code

    Examples
    --------
    >>> pe = SoundD()
    >>> pe.encode('Gough')
    '2000'
    >>> pe.encode('pneuma')
    '5500'
    >>> pe.encode('knight')
    '5300'
    >>> pe.encode('trice')
    '3620'
    >>> pe.encode('judge')
    '2200'

    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, expand ß, and restrict to A-Z.
    wd = unicode_normalize('NFKD', text_type(word.upper()))
    wd = wd.replace('ß', 'SS')
    wd = ''.join(ch for ch in wd if ch in self._uc_set)

    # Silent-prefix adjustments.
    if wd[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        wd = wd[1:]
    elif wd[:1] == 'X':
        wd = 'S' + wd[1:]
    elif wd[:2] == 'WH':
        wd = 'W' + wd[2:]

    # Special digraph codings, then the main letter-to-digit mapping.
    wd = wd.replace('DGE', '20').replace('DGI', '20').replace('GH', '0')
    wd = wd.translate(self._trans)
    wd = self._delete_consecutive_repeats(wd)
    wd = wd.replace('0', '')

    # Pad or truncate to the configured length (-1 means unlimited).
    if self._max_length != -1:
        wd = wd.ljust(self._max_length, '0')[:self._max_length]
    return wd
def normalize_column(name):
    """Normalize a column name for use in PostgreSQL.

    Underscores are treated as spaces for text normalization, restored
    afterwards, and the result is NFKC-normalized and trimmed to fit
    PostgreSQL's identifier limit.

    :param str name: the raw column name
    :returns: the normalized column name, at most 63 UTF-8 bytes long
    :rtype: str
    """
    name = name.replace('_', ' ')
    name = text_normalize(name)
    name = name.replace(' ', '_')
    name = unicode_normalize('NFKC', name)
    # Column names can be 63 *bytes* max in PostgreSQL. Trim whole
    # characters (never raw bytes) so multi-byte characters aren't split.
    while len(name.encode('utf-8')) >= 64:
        name = name[:-1]
    return name
def encode(self, word, max_length=4):
    """Return the SoundD code.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 4); -1 for unlimited

    Returns
    -------
    str
        The SoundD code

    Examples
    --------
    >>> pe = SoundD()
    >>> pe.encode('Gough')
    '2000'
    >>> pe.encode('pneuma')
    '5500'
    >>> pe.encode('knight')
    '5300'
    >>> pe.encode('trice')
    '3620'
    >>> pe.encode('judge')
    '2200'
    """
    # Uppercase, decompose, expand ß, and restrict to A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in self._uc_set)

    # Silent-prefix adjustments (e.g. KNight -> Night, Xylo -> Sylo).
    if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        word = word[1:]
    elif word[:1] == 'X':
        word = 'S' + word[1:]
    elif word[:2] == 'WH':
        word = 'W' + word[2:]

    # Special digraph codings, then the main letter-to-digit mapping.
    word = (word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0'))

    word = word.translate(self._trans)
    word = self._delete_consecutive_repeats(word)
    word = word.replace('0', '')  # drop the vowel/ignored class

    # Pad with zeros or truncate to max_length (-1 means unlimited).
    if max_length != -1:
        if len(word) < max_length:
            word += '0' * (max_length - len(word))
        else:
            word = word[:max_length]

    return word
def sound_d(word, max_length=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        -1 for unlimited
    :returns: the SoundD code
    :rtype: str

    >>> sound_d('Gough')
    '2000'
    >>> sound_d('pneuma')
    '5500'
    >>> sound_d('knight')
    '5300'
    >>> sound_d('trice')
    '3620'
    >>> sound_d('judge')
    '2200'
    """
    # Soundex-style digit classes, keyed by code point for str.translate.
    _ref_soundd_translation = dict(
        zip((ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230120022455012623010202'))

    # Uppercase, decompose, expand ß, and restrict to A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(
        c for c in word
        if c in {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
            'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z'
        })

    # Silent-prefix adjustments (e.g. KNight -> Night, Xylo -> Sylo).
    if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        word = word[1:]
    elif word[:1] == 'X':
        word = 'S' + word[1:]
    elif word[:2] == 'WH':
        word = 'W' + word[2:]

    # Special digraph codings, then the main letter-to-digit mapping.
    word = word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0')

    word = word.translate(_ref_soundd_translation)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')  # drop the vowel/ignored class

    # Pad with zeros or truncate to max_length (-1 means unlimited).
    if max_length != -1:
        if len(word) < max_length:
            word += '0' * (max_length - len(word))
        else:
            word = word[:max_length]

    return word
def encode(self, word, alphabetic=False):
    """Return the Wåhlin code for a word.

    Parameters
    ----------
    word : str
        The word to transform
    alphabetic : bool
        If True, the encoder will apply its alphabetic form
        (.encode_alpha rather than .encode)

    Returns
    -------
    str
        The Wåhlin code value

    Examples
    --------
    >>> pe = Waahlin()
    >>> pe.encode('Christopher')
    'KRISTOFER'
    >>> pe.encode('Niall')
    'NJALL'
    >>> pe.encode('Smith')
    'SMITH'
    >>> pe.encode('Schmidt')
    '*MIDT'

    .. versionadded:: 0.4.0

    """
    # Uppercase, compose (NFC), and expand ß before transcription.
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    if not word:
        return ''

    if self._encoder is None:
        # No secondary encoder: transcribe the whole word piecewise.
        pieces = []
        while word:
            piece, word = self._encode_next(word)
            pieces.append(piece)
        return ''.join(pieces)

    # With a secondary encoder: transcribe the leading part, then hand
    # the remainder off to the wrapped encoder.
    prefix, remainder = self._encode_next(word)
    if alphabetic:
        return prefix + self._encoder.encode_alpha(remainder)
    return prefix + self._encoder.encode(remainder)
def encode(self, word: str) -> str:
    """Return the Russell Index (integer output) of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Russell Index value

    Examples
    --------
    >>> pe = RussellIndex()
    >>> pe.encode('Christopher')
    '3813428'
    >>> pe.encode('Niall')
    '715'
    >>> pe.encode('Smith')
    '3614'
    >>> pe.encode('Schmidt')
    '3614'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class
    .. versionchanged:: 0.6.0
        Made return a str

    """
    word = unicode_normalize('NFKD', word.upper())
    word = word.replace('GH', '')  # rule 3: GH is discarded
    word = word.rstrip('SZ')  # rule 3: trailing S/Z are discarded

    # Keep only letters Russell maps, then translate them to digits.
    word = ''.join(ch for ch in word if ch in self._uc_set)
    sdx = word.translate(self._trans)

    # Everything after the first '1' (vowel class) loses its 1s.
    first_one = sdx.find('1') + 1
    if first_one:
        sdx = sdx[:first_one] + sdx[first_one:].replace('1', '')

    # Collapse consecutive duplicate digits.
    return self._delete_consecutive_repeats(sdx)
def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False): """Return the Refined Soundex code for a word. Parameters ---------- word : str The word to transform max_length : int The length of the code returned (defaults to unlimited) zero_pad : bool Pad the end of the return value with 0s to achieve a max_length string retain_vowels : bool Retain vowels (as 0) in the resulting code Returns ------- str The Refined Soundex value Examples -------- >>> pe = RefinedSoundex() >>> pe.encode('Christopher') 'C393619' >>> pe.encode('Niall') 'N87' >>> pe.encode('Smith') 'S386' >>> pe.encode('Schmidt') 'S386' """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) # apply the Soundex algorithm sdx = word[:1] + word.translate(self._trans) sdx = self._delete_consecutive_repeats(sdx) if not retain_vowels: sdx = sdx.replace('0', '') # Delete vowels, H, W, Y if max_length > 0: if zero_pad: sdx += '0' * max_length sdx = sdx[:max_length] return sdx
def normalize_text(text: str) -> str:
    """Normalize text using regex patterns.

    Applies, in order: NFC normalization, lowercasing, fenced-code-block
    removal, whitespace collapsing, URL removal, punctuation-to-space and
    punctuation-to-period substitutions.

    :param text: the raw input text
    :returns: the normalized text
    """
    text = unicode_normalize("NFC", text)
    text = text.lower()
    # strip fenced code blocks before any other rewriting
    text = re.sub(r"```(.|\n|\r)*?```", "", text)
    text = re.sub(r"\s+", " ", text)
    # Remove URLs *before* '/' and ':' are turned into spaces; previously
    # this ran after, so only the 'http' prefix was ever removed and the
    # rest of the URL leaked into the output.
    text = re.sub(r"http\S+", "", text)
    text = re.sub("[-_:/]", " ", text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub("[?!;…]", ".", text)
    text = text.replace("\n", ".")
    return text
def encode(self, word):
    """Return the Russell Index (integer output) of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    int
        The Russell Index value

    Examples
    --------
    >>> pe = RussellIndex()
    >>> pe.encode('Christopher')
    3813428
    >>> pe.encode('Niall')
    715
    >>> pe.encode('Smith')
    3614
    >>> pe.encode('Schmidt')
    3614

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(c for c in word if c in self._uc_set)
    sdx = word.translate(self._trans)

    # remove any 1s after the first occurrence
    one = sdx.find('1') + 1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = self._delete_consecutive_repeats(sdx)

    # return as an int
    # NOTE(review): when no codable letters survive, float('NaN') is
    # returned instead of an int — callers must handle both types.
    return int(sdx) if sdx else float('NaN')
def encode(self, word, alphabetic=False): """Return the Wåhlin code for a word. Parameters ---------- word : str The word to transform alphabetic : bool If True, the encoder will apply its alphabetic form (.encode_alpha rather than .encode) Returns ------- str The Wåhlin code value Examples -------- >>> pe = Waahlin() >>> pe.encode('Christopher') 'KRISTOFER' >>> pe.encode('Niall') 'NJALL' >>> pe.encode('Smith') 'SMITH' >>> pe.encode('Schmidt') '*MIDT' .. versionadded:: 0.4.0 """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFC', word.upper()) if not word: return '' if self._encoder is None: code = '' while word: part, word = self._encode_next(word) code += part return code code, word = self._encode_next(word) return code + ( self._encoder.encode_alpha(word) if alphabetic else self._encoder.encode(word) )
def _cost(s, t):
    # Alignment cost between the latest characters of candidate strings s
    # and t; uses self (from the enclosing scope) for weights and
    # character classes. Lower weight indices mean closer matches.

    # A trailing '-' marks a skip; '--' a double skip.
    if s[-1:] == '-':
        if s[-2:] == '--':
            return self._weights[6]
        else:
            return self._weights[7]
    elif t[-1:] == '-':
        if t[-2:] == '--':
            return self._weights[6]
        else:
            return self._weights[7]

    # Compare only the final (NFC-composed) character of each string.
    s = unicode_normalize('NFC', s)[-1:]
    t = unicode_normalize('NFC', t)[-1:]

    if s == t:
        # Exact match: consonants/glides are weighted differently from
        # other (vowel) matches.
        if s in self._consonants or s in self._glides:
            return self._weights[0]
        else:
            return self._weights[1]

    # i/y and u/w pairs are treated as near-matches.
    if ''.join(sorted([s, t])) in {'iy', 'uw'}:
        return self._weights[2]

    # Fall back to comparing the NFKD base characters (accents stripped).
    sd = unicode_normalize('NFKD', s)
    td = unicode_normalize('NFKD', t)

    if sd[0] == td[0] and s in self._vowels:
        # same base vowel, differing only in diacritics
        return self._weights[2]
    if sd[0] in self._vowels and td[0] in self._vowels:
        return self._weights[3]
    if sd[0] in self._consonants and td[0] in self._consonants:
        return self._weights[4]
    # vowel vs consonant (or unclassified) mismatch
    return self._weights[5]
def encode(self, word):
    """Return the Refined Soundex code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Refined Soundex value

    Examples
    --------
    >>> pe = RefinedSoundex()
    >>> pe.encode('Christopher')
    'C93619'
    >>> pe.encode('Niall')
    'N7'
    >>> pe.encode('Smith')
    'S86'
    >>> pe.encode('Schmidt')
    'S386'

    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, expand ß, and restrict to A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(filter(self._uc_set.__contains__, word))

    # First letter is kept verbatim; the rest is translated to digits.
    coded = word[:1] + word[1:].translate(self._trans)
    coded = self._delete_consecutive_repeats(coded)
    if not self._retain_vowels:
        coded = coded.replace('0', '')  # drop vowels, H, W, Y

    if self._max_length > 0:
        if self._zero_pad:
            coded = coded.ljust(self._max_length, '0')
        coded = coded[:self._max_length]

    return coded
def encode(self, word): """Return the PhoneticSpanish coding of word. Parameters ---------- word : str The word to transform Returns ------- str The PhoneticSpanish code Examples -------- >>> pe = PhoneticSpanish() >>> pe.encode('Perez') '094' >>> pe.encode('Martinez') '69364' >>> pe.encode('Gutierrez') '83994' >>> pe.encode('Santiago') '4638' >>> pe.encode('Nicolás') '6454' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, and decompose, filter to A-Z minus vowels & W word = unicode_normalize('NFKD', text_type(word.upper())) word = ''.join(c for c in word if c in self._uc_set) # merge repeated Ls & Rs word = word.replace('LL', 'L') word = word.replace('R', 'R') # apply the Soundex algorithm sdx = word.translate(self._trans) if self._max_length > 0: sdx = (sdx + ('0' * self._max_length))[: self._max_length] return sdx
def encode(self, word): """Return the Refined Soundex code for a word. Parameters ---------- word : str The word to transform Returns ------- str The Refined Soundex value Examples -------- >>> pe = RefinedSoundex() >>> pe.encode('Christopher') 'C93619' >>> pe.encode('Niall') 'N7' >>> pe.encode('Smith') 'S86' >>> pe.encode('Schmidt') 'S386' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) # apply the Soundex algorithm sdx = word[:1] + word[1:].translate(self._trans) sdx = self._delete_consecutive_repeats(sdx) if not self._retain_vowels: sdx = sdx.replace('0', '') # Delete vowels, H, W, Y if self._max_length > 0: if self._zero_pad: sdx += '0' * self._max_length sdx = sdx[:self._max_length] return sdx
def refined_soundex(word, max_length=-1, zero_pad=False, retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param max_length: the length of the code returned (defaults to
        unlimited)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    # Refined Soundex digit classes, keyed by code point for str.translate.
    _ref_soundex_translation = dict(
        zip((ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(
        c for c in word
        if c in {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
            'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z'
        })

    # apply the Soundex algorithm
    # NOTE: the whole word (including the first letter) is translated, so
    # the initial letter appears both verbatim and as its digit — this is
    # what the doctest examples above expect (e.g. 'C393619' begins 'C3').
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if max_length > 0:
        if zero_pad:
            sdx += ('0' * max_length)
        sdx = sdx[:max_length]

    return sdx
def encode(self, word): """Return the FONEM code of a word. Parameters ---------- word : str The word to transform Returns ------- str The FONEM code Examples -------- >>> pe = FONEM() >>> pe.encode('Marchand') 'MARCHEN' >>> pe.encode('Beaulieu') 'BOLIEU' >>> pe.encode('Beaumont') 'BOMON' >>> pe.encode('Legrand') 'LEGREN' >>> pe.encode('Pelletier') 'PELETIER' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # normalize, upper-case, and filter non-French letters word = unicode_normalize('NFKD', text_type(word.upper())) word = word.translate({198: 'AE', 338: 'OE'}) word = ''.join(c for c in word if c in self._uc_set) for rule in self._rule_order: regex, repl = self._rule_table[rule] if isinstance(regex, text_type): word = word.replace(regex, repl) else: word = regex.sub(repl, word) return word
def encode(self, word):
    """Return the Statistics Canada code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Statistics Canada name code value

    Examples
    --------
    >>> pe = StatisticsCanada()
    >>> pe.encode('Christopher')
    'CHRS'
    >>> pe.encode('Niall')
    'NL'
    >>> pe.encode('Smith')
    'SMTH'
    >>> pe.encode('Schmidt')
    'SCHM'

    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, expand ß, and restrict to A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in self._uc_set)
    if not word:
        return ''

    # Strip vowels and Y from everything after the first letter.
    tail = word[1:]
    for vowel in self._uc_vy_set:
        tail = tail.replace(vowel, '')

    coded = self._delete_consecutive_repeats(word[0] + tail)
    coded = coded.replace(' ', '')

    return coded[:self._max_length]
def encode(self, word): """Return the Statistics Canada code for a word. Parameters ---------- word : str The word to transform Returns ------- str The Statistics Canada name code value Examples -------- >>> pe = StatisticsCanada() >>> pe.encode('Christopher') 'CHRS' >>> pe.encode('Niall') 'NL' >>> pe.encode('Smith') 'SMTH' >>> pe.encode('Schmidt') 'SCHM' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) if not word: return '' code = word[1:] for vowel in self._uc_vy_set: code = code.replace(vowel, '') code = word[0] + code code = self._delete_consecutive_repeats(code) code = code.replace(' ', '') return code[: self._max_length]
def fingerprint(self, word):
    """Return the skeleton key.

    The key is the word's first letter, followed by its remaining unique
    consonants in order of appearance, followed by its unique vowels in
    order of appearance.

    Parameters
    ----------
    word : str
        The word to transform into its skeleton key

    Returns
    -------
    str
        The skeleton key

    Examples
    --------
    >>> sk = SkeletonKey()
    >>> sk.fingerprint('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> sk.fingerprint('Christopher')
    'CHRSTPIOE'
    >>> sk.fingerprint('Niall')
    'NLIA'

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    word = unicode_normalize('NFKD', word.upper())
    word = ''.join(ch for ch in word if ch in self._letters)

    first = word[0:1]
    consonants = []
    vowels = []
    # Bucket each subsequent character (skipping repeats of the first
    # letter), keeping only the first occurrence of each character.
    for ch in word[1:]:
        if ch == first:
            continue
        bucket = vowels if ch in self._vowels else consonants
        if ch not in bucket:
            bucket.append(ch)

    return first + ''.join(consonants) + ''.join(vowels)
def encode(self, word):
    """Return the FONEM code of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The FONEM code

    Examples
    --------
    >>> pe = FONEM()
    >>> pe.encode('Marchand')
    'MARCHEN'
    >>> pe.encode('Beaulieu')
    'BOLIEU'
    >>> pe.encode('Beaumont')
    'BOMON'
    >>> pe.encode('Legrand')
    'LEGREN'
    >>> pe.encode('Pelletier')
    'PELETIER'

    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Normalize and uppercase; expand the Æ (198) and Œ (338) ligatures,
    # then keep only letters used in French.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(ch for ch in word if ch in self._uc_set)

    # Run the FONEM rewrite rules in order; a rule is either a plain
    # string replacement or a compiled-regex substitution.
    for rule_id in self._rule_order:
        pattern, replacement = self._rule_table[rule_id]
        if isinstance(pattern, text_type):
            word = word.replace(pattern, replacement)
        else:
            word = pattern.sub(replacement, word)

    return word
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # Russell's letter-to-digit mapping (H and W are unmapped/dropped).
    _russell_translation = dict(
        zip((ord(_) for _ in 'ABCDEFGIKLMNOPQRSTUVXYZ'),
            '12341231356712383412313'))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(
        c for c in word
        if c in {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N',
            'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'
        })
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1') + 1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    # NOTE(review): when no codable letters survive, float('NaN') is
    # returned instead of an int — callers must handle both types.
    return int(sdx) if sdx else float('NaN')
def encode(self, word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to unlimited)

    Returns
    -------
    str
        The PhoneticSpanish code

    Examples
    --------
    >>> pe = PhoneticSpanish()
    >>> pe.encode('Perez')
    '094'
    >>> pe.encode('Martinez')
    '69364'
    >>> pe.encode('Gutierrez')
    '83994'
    >>> pe.encode('Santiago')
    '4638'
    >>> pe.encode('Nicolás')
    '6454'
    """
    # Uppercase, decompose, and keep A-Z minus vowels & W.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(ch for ch in word if ch in self._uc_set)

    # Merge repeated Ls.
    # NOTE(review): replace('R', 'R') is a no-op — 'RR' -> 'R' was
    # presumably intended, but the doctest outputs above depend on double
    # R being coded twice, so behavior is deliberately preserved.
    word = word.replace('LL', 'L')
    word = word.replace('R', 'R')

    # Translate the surviving consonants to their digit classes.
    sdx = word.translate(self._trans)

    if max_length > 0:
        sdx = (sdx + '0' * max_length)[:max_length]

    return sdx
def encode(self, word, max_length=4, zero_pad=True):
    """Return the Lein code for a word.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 4)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length
        string

    Returns
    -------
    str
        The Lein code

    Examples
    --------
    >>> pe = Lein()
    >>> pe.encode('Christopher')
    'C351'
    >>> pe.encode('Niall')
    'N300'
    >>> pe.encode('Smith')
    'S210'
    >>> pe.encode('Schmidt')
    'S521'
    """
    # Uppercase, decompose, expand ß, and restrict to A-Z.
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in self._uc_set)

    code = word[:1]  # Rule 1: keep the first letter verbatim
    rest = word[1:].translate(self._del_trans)  # Rule 2: delete vowels etc.
    rest = self._delete_consecutive_repeats(rest)  # Rule 3: collapse runs
    code += rest.translate(self._trans)  # Rule 4: letters -> digits

    if zero_pad:
        code += '0' * max_length  # Rule 4 (padding)

    return code[:max_length]
def statistics_canada(word, max_length=4): """Return the Statistics Canada code for a word. The original description of this algorithm could not be located, and may only have been specified in an unpublished TR. The coding does not appear to be in use by Statistics Canada any longer. In its place, this is an implementation of the "Census modified Statistics Canada name coding procedure". The modified version of this algorithm is described in Appendix B of :cite:`Moore:1977`. :param str word: the word to transform :param int max_length: the maximum length (default 4) of the code to return :returns: the Statistics Canada name code value :rtype: str >>> statistics_canada('Christopher') 'CHRS' >>> statistics_canada('Niall') 'NL' >>> statistics_canada('Smith') 'SMTH' >>> statistics_canada('Schmidt') 'SCHM' """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join( c for c in word if c in { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' }) if not word: return '' code = word[1:] for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}: code = code.replace(vowel, '') code = word[0] + code code = _delete_consecutive_repeats(code) code = code.replace(' ', '') return code[:max_length]
def encode(self, word):
    """Return the Russell Index (integer output) of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    int
        The Russell Index value

    Examples
    --------
    >>> pe = RussellIndex()
    >>> pe.encode('Christopher')
    3813428
    >>> pe.encode('Niall')
    715
    >>> pe.encode('Smith')
    3614
    >>> pe.encode('Schmidt')
    3614
    """
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # rule 3: GH is discarded
    word = word.rstrip('SZ')  # rule 3: trailing S/Z are discarded

    # Keep only letters Russell maps, then translate them to digits.
    word = ''.join(ch for ch in word if ch in self._uc_set)
    sdx = word.translate(self._trans)

    # Everything after the first '1' (vowel class) loses its 1s.
    cut = sdx.find('1') + 1
    if cut:
        sdx = sdx[:cut] + sdx[cut:].replace('1', '')

    # Collapse consecutive duplicate digits, then return as an int
    # (float('NaN') when nothing survives coding).
    sdx = self._delete_consecutive_repeats(sdx)
    return int(sdx) if sdx else float('NaN')
def encode(self, word): """Return the LEIN code for a word. Parameters ---------- word : str The word to transform Returns ------- str The LEIN code Examples -------- >>> pe = LEIN() >>> pe.encode('Christopher') 'C351' >>> pe.encode('Niall') 'N300' >>> pe.encode('Smith') 'S210' >>> pe.encode('Schmidt') 'S521' .. versionadded:: 0.3.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') word = ''.join(c for c in word if c in self._uc_set) code = word[:1] # Rule 1 word = word[1:].translate(self._del_trans) # Rule 2 word = self._delete_consecutive_repeats(word) # Rule 3 code += word.translate(self._trans) # Rule 4 if self._zero_pad: code += '0' * self._max_length # Rule 4 return code[:self._max_length]
def fingerprint(self, word):
    """Return the omission key.

    Parameters
    ----------
    word : str
        The word to transform into its omission key

    Returns
    -------
    str
        The omission key

    Examples
    --------
    >>> ok = OmissionKey()
    >>> ok.fingerprint('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> ok.fingerprint('Christopher')
    'PHCTSRIOE'
    >>> ok.fingerprint('Niall')
    'LNIA'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, and keep only letters recognized by the class.
    # (The Py2 text_type() wrapper was dropped for consistency with the
    # modernized variant of this method elsewhere in the file.)
    word = unicode_normalize('NFKD', word.upper())
    word = ''.join(c for c in word if c in self._letters)

    key = ''

    # add consonants in order supplied by _consonants (no duplicates)
    for char in self._consonants:
        if char in word:
            key += char

    # add vowels in order they appeared in the word (no duplicates)
    for char in word:
        if char not in self._consonants and char not in key:
            key += char

    return key
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`. It includes some enhancements presented in the
    Java port at :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    from itertools import groupby

    # NOTE: these tables are rebuilt on every call; they are kept local to
    # preserve the function's self-contained interface.
    _phonem_substitutions = (
        ('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), ('TZ', 'C'), ('TS', 'C'),
        ('KS', 'X'), ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), ('UE', 'Y'),
        ('AE', 'E'), ('OE', 'Ö'), ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
        ('AU', 'A§'), ('OU', '§'),
    )
    _phonem_translation = dict(
        zip(
            (ord(_) for _ in 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    )

    # Uppercase and compose (NFC), then apply digraph substitutions in
    # order, followed by the single-character translation table.
    word = unicode_normalize('NFC', word.upper())
    for src, tar in _phonem_substitutions:
        word = word.replace(src, tar)
    word = word.translate(_phonem_translation)

    # Collapse adjacent duplicates, then keep only the valid output
    # alphabet (everything else, e.g. H, is silently dropped).
    return ''.join(
        c
        for c, _ in groupby(word)
        if c in {
            'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', 'U', 'V',
            'W', 'X', 'Y', 'Ö',
        }
    )
def fingerprint(self, word: str) -> str:
    """Return the omission key.

    Parameters
    ----------
    word : str
        The word to transform into its omission key

    Returns
    -------
    str
        The omission key

    Examples
    --------
    >>> ok = OmissionKey()
    >>> ok.fingerprint('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> ok.fingerprint('Christopher')
    'PHCTSRIOE'
    >>> ok.fingerprint('Niall')
    'LNIA'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, and keep only the recognized letters.
    normalized = unicode_normalize('NFKD', word.upper())
    filtered = ''.join(c for c in normalized if c in self._letters)

    # Consonants come first, in the fixed priority order, each at most once.
    parts = [cons for cons in self._consonants if cons in filtered]

    # Vowels follow, in order of first appearance, each at most once.
    for ch in filtered:
        if ch not in self._consonants and ch not in parts:
            parts.append(ch)

    return ''.join(parts)
def dist_abs(self, src, tar):
    """Return the Editex distance between two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    int
        Editex distance

    Examples
    --------
    >>> cmp = Editex()
    >>> cmp.dist_abs('cat', 'hat')
    2
    >>> cmp.dist_abs('Niall', 'Neil')
    2
    >>> cmp.dist_abs('aluminum', 'Catalan')
    12
    >>> cmp.dist_abs('ATCG', 'TAGC')
    6


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Unpack the (match, group, mismatch) cost triple configured on self.
    match_cost, group_cost, mismatch_cost = self._cost

    def r_cost(ch1, ch2):
        """Return r(a,b) according to Zobel & Dart's definition.

        Parameters
        ----------
        ch1 : str
            The first character to compare
        ch2 : str
            The second character to compare

        Returns
        -------
        int
            r(a,b) according to Zobel & Dart's definition

        .. versionadded:: 0.1.0

        """
        # Identical characters match; characters in the same phonetic
        # letter group get the cheaper group cost; anything else is a
        # full mismatch.
        if ch1 == ch2:
            return match_cost
        if ch1 in self._all_letters and ch2 in self._all_letters:
            for group in self._letter_groups:
                if ch1 in group and ch2 in group:
                    return group_cost
        return mismatch_cost

    def d_cost(ch1, ch2):
        """Return d(a,b) according to Zobel & Dart's definition.

        Parameters
        ----------
        ch1 : str
            The first character to compare
        ch2 : str
            The second character to compare

        Returns
        -------
        int
            d(a,b) according to Zobel & Dart's definition

        .. versionadded:: 0.1.0

        """
        # Like r(a,b), except an H or W followed by a different letter
        # is charged only the group cost (they are often silent).
        if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
            return group_cost
        return r_cost(ch1, ch2)

    # convert both src & tar to NFKD normalized unicode
    src = unicode_normalize('NFKD', text_type(src.upper()))
    tar = unicode_normalize('NFKD', text_type(tar.upper()))
    # convert ß to SS (for Python2)
    src = src.replace('ß', 'SS')
    tar = tar.replace('ß', 'SS')

    src_len = len(src)
    tar_len = len(tar)
    max_len = max(src_len, tar_len)

    # NOTE(review): equal strings return 0.0 (float) while the documented
    # return type is int -- presumably harmless, but worth confirming.
    if src == tar:
        return 0.0
    # With one side empty, the distance is a sum of position-weighted
    # mismatch costs (self._taper appears to weight by position; exact
    # semantics defined elsewhere in the class -- TODO confirm).
    if not src:
        return sum(
            mismatch_cost * self._taper(pos, max_len)
            for pos in range(tar_len)
        )
    if not tar:
        return sum(
            mismatch_cost * self._taper(pos, max_len)
            for pos in range(src_len)
        )

    # Dynamic-programming matrix, one extra row/column for the empty
    # prefix; strings are padded with a leading space so that index i
    # refers to the i-th character (1-based).
    d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
    src = ' ' + src
    tar = ' ' + tar

    # For global (non-local) alignment, initialize the first row and
    # column with cumulative deletion/insertion costs.
    if not self._local:
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
                src[i - 1], src[i]
            ) * self._taper(i, max_len)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + d_cost(
                tar[j - 1], tar[j]
            ) * self._taper(j, max_len)

    # Standard edit-distance recurrence: deletion, insertion, or
    # (group-aware) substitution, each taper-weighted by position.
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            d_mat[i, j] = min(
                d_mat[i - 1, j]
                + d_cost(src[i - 1], src[i]) * self._taper(max(i, j), max_len),
                d_mat[i, j - 1]
                + d_cost(tar[j - 1], tar[j]) * self._taper(max(i, j), max_len),
                d_mat[i - 1, j - 1]
                + r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len),
            )

    # Return an int when the result is integral, else the float itself.
    if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
        return int(d_mat[src_len, tar_len])
    else:
        return d_mat[src_len, tar_len]
def encode(self, word):
    """Return the Roger Root code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Roger Root code

    Examples
    --------
    >>> pe = RogerRoot()
    >>> pe.encode('Christopher')
    '06401'
    >>> pe.encode('Niall')
    '02500'
    >>> pe.encode('Smith')
    '00310'
    >>> pe.encode('Schmidt')
    '06310'


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', word.upper())
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in self._uc_set)

    code = ''
    pos = 0

    # Encode the initial letters, preferring the longest matching pattern
    # (4 letters down to 1).
    for num in range(4, 0, -1):
        if word[:num] in self._init_patterns[num]:
            code = self._init_patterns[num][word[:num]]
            pos += num
            break

    # Encode the remainder with the medial patterns, longest match first.
    # The 1-letter tables cover every letter, so a match always occurs.
    while pos < len(word):
        for num in range(4, 0, -1):  # pragma: no branch
            if word[pos : pos + num] in self._med_patterns[num]:
                code += self._med_patterns[num][word[pos : pos + num]]
                pos += num
                break

    # Collapse duplicate digits and drop '*' placeholders.
    code = self._delete_consecutive_repeats(code)
    code = code.replace('*', '')

    if self._zero_pad:
        code += '0' * self._max_length

    return code[: self._max_length]
def encode(self, word):
    """Return the IBM Alpha Search Inquiry System code for a word.

    A collection is necessary as the return type since there can be
    multiple values for a single word. But the collection must be ordered
    since the first value is the primary coding.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    tuple
        The Alpha-SIS value

    Examples
    --------
    >>> pe = AlphaSIS()
    >>> pe.encode('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> pe.encode('Niall')
    ('02500000000000',)
    >>> pe.encode('Smith')
    ('03100000000000',)
    >>> pe.encode('Schmidt')
    ('06310000000000',)


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # alpha holds every candidate code built so far; entries fork when a
    # substring has multiple possible codings.
    alpha = ['']
    pos = 0
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in self._uc_set)

    # Do special processing for initial substrings
    for k in self._alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += self._alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        orig_pos = pos
        for k in self._alpha_sis_basic_order:
            if word[pos:].startswith(k):
                if isinstance(self._alpha_sis_basic[k], tuple):
                    # Ambiguous substring: fork every candidate code, one
                    # branch per alternative coding.
                    newalpha = []
                    for i in range(len(self._alpha_sis_basic[k])):
                        newalpha += [
                            _ + self._alpha_sis_basic[k][i] for _ in alpha
                        ]
                    alpha = newalpha
                else:
                    alpha = [_ + self._alpha_sis_basic[k] for _ in alpha]
                pos += len(k)
                break
        # No table entry matched: emit a '_' placeholder and move on.
        if pos == orig_pos:
            alpha = [_ + '_' for _ in alpha]
            pos += 1

    # Trim doublets and placeholders
    # NOTE(review): pos advances even after a removal, so a run of three
    # identical characters collapses to two, not one -- presumably
    # intentional doublet (not run) removal; confirm against the spec.
    for i in range(len(alpha)):
        pos = 1
        while pos < len(alpha[i]):
            if alpha[i][pos] == alpha[i][pos - 1]:
                alpha[i] = alpha[i][:pos] + alpha[i][pos + 1 :]
            pos += 1
    alpha = (_.replace('_', '') for _ in alpha)

    # Trim codes and return tuple
    alpha = (
        (_ + ('0' * self._max_length))[: self._max_length] for _ in alpha
    )
    return tuple(alpha)
def encode(self, fname):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    Parameters
    ----------
    fname : str
        The first name to encode

    Returns
    -------
    str
        The PSHP Soundex/Viewex Coding

    Examples
    --------
    >>> pe = PSHPSoundexFirst()
    >>> pe.encode('Smith')
    'S530'
    >>> pe.encode('Waters')
    'W352'
    >>> pe.encode('James')
    'J700'
    >>> pe.encode('Schmidt')
    'S500'
    >>> pe.encode('Ashcroft')
    'A220'
    >>> pe.encode('John')
    'J500'
    >>> pe.encode('Colin')
    'K400'
    >>> pe.encode('Niall')
    'N400'
    >>> pe.encode('Sally')
    'S400'
    >>> pe.encode('Jane')
    'J500'


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, and keep only A-Z.
    fname = unicode_normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in self._uc_set)

    # special rules: two names get fixed codes outright
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'
    else:
        # A. Prefix treatment: re-spell leading letters phonetically
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        # German-mode initial-letter substitutions
        if self._german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[
                fname[0]
            ] + fname[1:]

        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        fname = fname.translate(self._trans)
        fname = self._delete_consecutive_repeats(fname)

        code += fname[1:]
        # Truncate after the second syllable marker ('0' codes vowels).
        # NOTE(review): syl2_ptr is an index into the slice after syl_ptr,
        # so `syl2_ptr - syl_ptr > -1` is an odd comparison of offsets in
        # different frames -- looks suspicious; confirm against the paper.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1 :].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[: syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    # Zero-pad or truncate to the configured length.
    if self._max_length != -1:
        if len(code) < self._max_length:
            code += '0' * (self._max_length - len(code))
        else:
            code = code[: self._max_length]

    return code
def encode(self, word):
    """Return the Phonix code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Phonix value

    Examples
    --------
    >>> pe = Phonix()
    >>> pe.encode('Christopher')
    'K683'
    >>> pe.encode('Niall')
    'N400'
    >>> pe.encode('Smith')
    'S530'
    >>> pe.encode('Schmidt')
    'S530'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """

    def _start_repl(word, src, tar, post=None):
        """Replace src with tar at the start of word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        post : set
            Following characters

        Returns
        -------
        str
            Modified string

        .. versionadded:: 0.1.0

        """
        # With a post-context, replace only when src is followed by one
        # of the context characters.
        if post:
            for i in post:
                if word.startswith(src + i):
                    return tar + word[len(src) :]
        elif word.startswith(src):
            return tar + word[len(src) :]
        return word

    def _end_repl(word, src, tar, pre=None):
        """Replace src with tar at the end of word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters

        Returns
        -------
        str
            Modified string

        .. versionadded:: 0.1.0

        """
        # Mirror of _start_repl: optional pre-context before src.
        if pre:
            for i in pre:
                if word.endswith(i + src):
                    return word[: -len(src)] + tar
        elif word.endswith(src):
            return word[: -len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar in the middle of word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters
        post : set
            Following characters

        Returns
        -------
        str
            Modified string

        .. versionadded:: 0.1.0

        """
        # Exclude the first and/or last character from matching so the
        # replacement is strictly word-internal, then delegate.
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        return (
            word[0] + _all_repl(word[1:-1], src, tar, pre, post) + word[-1]
        )

    def _all_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar anywhere in word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters
        post : set
            Following characters

        Returns
        -------
        str
            Modified string

        .. versionadded:: 0.1.0

        """
        if pre or post:
            # Default a missing context to the empty string so the
            # product below still iterates.
            if post:
                post = post
            else:
                post = frozenset(('',))
            if pre:
                pre = pre
            else:
                pre = frozenset(('',))

            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i + src + j, i + tar + j)
            return word
        else:
            return word.replace(src, tar)

    # Dispatch table indexed by the position constant stored in each
    # substitution rule (start / end / middle / anywhere).
    repl_at = (_start_repl, _end_repl, _mid_repl, _all_repl)

    sdx = ''

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in self._uc_set)
    if word:
        # Apply every Phonix re-spelling rule in order.
        for trans in self._substitutions:
            word = repl_at[trans[0]](word, *trans[1:])
        # A leading vowel (or Y) is coded as the marker 'v'.
        if word[0] in self._uc_vy_set:
            sdx = 'v' + word[1:].translate(self._trans)
        else:
            sdx = word[0] + word[1:].translate(self._trans)
        sdx = self._delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    if self._zero_pad:
        sdx += '0' * self._max_length
    if not sdx:
        sdx = '0'
    return sdx[: self._max_length]
def encode(self, word):
    """Return the Phonex code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Phonex value

    Examples
    --------
    >>> pe = Phonex()
    >>> pe.encode('Christopher')
    'C623'
    >>> pe.encode('Niall')
    'N400'
    >>> pe.encode('Schmidt')
    'S253'
    >>> pe.encode('Smith')
    'S530'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    name = unicode_normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    name_code = last = ''

    # Deletions effected by replacing with next letter which
    # will be ignored due to duplicate handling of Soundex code.
    # This is faster than 'moving' all subsequent letters.

    # Remove any trailing Ss
    while name[-1:] == 'S':
        name = name[:-1]

    # Phonetic equivalents of first 2 characters
    # Works since duplicate letters are ignored
    if name[:2] == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif name[:2] == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif name[:2] == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    if name:
        # Special case, ignore H first letter (subsequent Hs ignored
        # anyway)
        # Works since duplicate letters are ignored
        if name[0] == 'H':
            name = name[1:]

    if name:
        # Phonetic equivalents of first character
        if name[0] in self._uc_vy_set:
            name = 'A' + name[1:]
        elif name[0] in {'B', 'P'}:
            name = 'B' + name[1:]
        elif name[0] in {'V', 'F'}:
            name = 'F' + name[1:]
        elif name[0] in {'C', 'K', 'Q'}:
            name = 'C' + name[1:]
        elif name[0] in {'G', 'J'}:
            name = 'G' + name[1:]
        elif name[0] in {'S', 'Z'}:
            name = 'S' + name[1:]

        name_code = last = name[0]

    # Modified Soundex code
    for i in range(1, len(name)):
        # Default '0' marks letters that are not coded (vowels, H, W, Y,
        # and context-suppressed consonants).
        code = '0'
        if name[i] in {'B', 'F', 'P', 'V'}:
            code = '1'
        elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
            code = '2'
        elif name[i] in {'D', 'T'}:
            # D/T before C is left uncoded.
            if name[i + 1 : i + 2] != 'C':
                code = '3'
        elif name[i] == 'L':
            # L codes only before a vowel or at the end of the name.
            if name[i + 1 : i + 2] in self._uc_vy_set or i + 1 == len(
                name
            ):
                code = '4'
        elif name[i] in {'M', 'N'}:
            # A following D or G is overwritten with the nasal itself.
            if name[i + 1 : i + 2] in {'D', 'G'}:
                name = name[: i + 1] + name[i] + name[i + 2 :]
            code = '5'
        elif name[i] == 'R':
            # R codes only before a vowel or at the end of the name.
            if name[i + 1 : i + 2] in self._uc_vy_set or i + 1 == len(
                name
            ):
                code = '6'

        # Append only when it differs from the previous code (duplicate
        # suppression) and is an actual coding.
        if code != last and code != '0' and i != 0:
            name_code += code

        last = name_code[-1]

    # Zero-pad and truncate to the configured length.
    if self._zero_pad:
        name_code += '0' * self._max_length
    if not name_code:
        name_code = '0'
    return name_code[: self._max_length]
def encode(self, word):
    """Return the SoundexBR encoding of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The SoundexBR code

    Examples
    --------
    >>> pe = SoundexBR()
    >>> pe.encode('Oliveira')
    'O416'
    >>> pe.encode('Almeida')
    'A453'
    >>> pe.encode('Barbosa')
    'B612'
    >>> pe.encode('Araújo')
    'A620'
    >>> pe.encode('Gonçalves')
    'G524'
    >>> pe.encode('Goncalves')
    'G524'


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase and decompose; accents become combining marks, which the
    # A-Z filter below removes (so Ç -> C, Ú -> U, etc.).
    word = unicode_normalize('NFKD', word.upper())
    word = ''.join(c for c in word if c in self._uc_set)

    # Initial-letter substitutions for Brazilian Portuguese phonetics.
    if word[:2] == 'WA':
        first = 'V'
    elif word[:1] == 'K' and word[1:2] in {'A', 'O', 'U'}:
        first = 'C'
    elif word[:1] == 'C' and word[1:2] in {'I', 'E'}:
        first = 'S'
    elif word[:1] == 'G' and word[1:2] in {'E', 'I'}:
        first = 'J'
    elif word[:1] == 'Y':
        first = 'I'
    elif word[:1] == 'H':
        # A leading H is silent: drop it and start from the next letter.
        first = word[1:2]
        word = word[1:]
    else:
        first = word[:1]

    # Standard Soundex-style digit coding of the remainder.
    sdx = first + word[1:].translate(self._trans)
    sdx = self._delete_consecutive_repeats(sdx)
    sdx = sdx.replace('0', '')

    if self._zero_pad:
        sdx += '0' * self._max_length

    return sdx[: self._max_length]
def encode(self, lname):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    Parameters
    ----------
    lname : str
        The last name to encode

    Returns
    -------
    str
        The PSHP Soundex/Viewex Coding

    Examples
    --------
    >>> pe = PSHPSoundexLast()
    >>> pe.encode('Smith')
    'S530'
    >>> pe.encode('Waters')
    'W350'
    >>> pe.encode('James')
    'J500'
    >>> pe.encode('Schmidt')
    'S530'
    >>> pe.encode('Ashcroft')
    'A225'


    .. versionadded:: 0.3.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # Uppercase, decompose, and keep only A-Z.
    lname = unicode_normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(c for c in lname if c in self._uc_set)

    # A. Prefix treatment
    if lname[:3] == 'VON' or lname[:3] == 'VAN':
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it
    # meant to say they become M except in German data (where superscripted
    # 1 indicates "except in German data"). It doesn't make sense for them
    # to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
    # both articles have this error(?).
    if not self._german:
        if lname[:3] == 'MAC':
            lname = 'M' + lname[3:]
        elif lname[:2] == 'MC':
            lname = 'M' + lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # Re-spell the leading letter(s) phonetically.
    if lname[:1] in {'E', 'I', 'O', 'U'}:
        lname = 'A' + lname[1:]
    elif lname[:2] in {'GE', 'GI', 'GY'}:
        lname = 'J' + lname[1:]
    elif lname[:2] in {'CE', 'CI', 'CY'}:
        lname = 'S' + lname[1:]
    elif lname[:3] == 'CHR':
        lname = 'K' + lname[1:]
    elif lname[:1] == 'C' and lname[:2] != 'CH':
        lname = 'K' + lname[1:]

    if lname[:2] == 'KN':
        lname = 'N' + lname[1:]
    elif lname[:2] == 'PH':
        lname = 'F' + lname[1:]
    elif lname[:3] in {'WIE', 'WEI'}:
        lname = 'V' + lname[1:]

    # German-mode initial-letter substitutions.
    if self._german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
        lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]] + lname[
            1:
        ]

    code = lname[:1]

    # B. Postfix treatment
    if self._german:  # moved from end of postfix treatment due to blocking
        # German suffix stripping: -TES/-TS, -TZE/-ZE, -Z/-TE.
        if lname[-3:] == 'TES':
            lname = lname[:-3]
        elif lname[-2:] == 'TS':
            lname = lname[:-2]
        if lname[-3:] == 'TZE':
            lname = lname[:-3]
        elif lname[-2:] == 'ZE':
            lname = lname[:-2]
        if lname[-1:] == 'Z':
            lname = lname[:-1]
        elif lname[-2:] == 'TE':
            lname = lname[:-2]

    # General suffix rules: -R becomes -N; -SE/-CE, -SS, -S are stripped.
    if lname[-1:] == 'R':
        lname = lname[:-1] + 'N'
    elif lname[-2:] in {'SE', 'CE'}:
        lname = lname[:-2]
    if lname[-2:] == 'SS':
        lname = lname[:-2]
    elif lname[-1:] == 'S':
        lname = lname[:-1]

    # Non-German ending re-spellings (e.g. -STOWN -> -SAWON).
    if not self._german:
        l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
        l4_repl = {
            'NSEN': 'ASEN',
            'MSON': 'ASON',
            'STEN': 'SAEN',
            'STON': 'SAON',
        }
        if lname[-5:] in l5_repl:
            lname = lname[:-5] + l5_repl[lname[-5:]]
        elif lname[-4:] in l4_repl:
            lname = lname[:-4] + l4_repl[lname[-4:]]

    if lname[-2:] in {'NG', 'ND'}:
        lname = lname[:-1]
    if not self._german and lname[-3:] in {'GAN', 'GEN'}:
        lname = lname[:-3] + 'A' + lname[-2:]

    # C. Infix Treatment: simplify internal clusters.
    lname = lname.replace('CK', 'C')
    lname = lname.replace('SCH', 'S')
    lname = lname.replace('DT', 'T')
    lname = lname.replace('ND', 'N')
    lname = lname.replace('NG', 'N')
    lname = lname.replace('LM', 'M')
    lname = lname.replace('MN', 'M')
    lname = lname.replace('WIE', 'VIE')
    lname = lname.replace('WEI', 'VEI')

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    lname = lname.translate(self._trans)
    lname = self._delete_consecutive_repeats(lname)

    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    # Zero-pad or truncate to the configured length.
    if self._max_length != -1:
        if len(code) < self._max_length:
            code += '0' * (self._max_length - len(code))
        else:
            code = code[: self._max_length]

    return code
def encode(self, word):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The Daitch-Mokotoff Soundex value

    Examples
    --------
    >>> pe = DaitchMokotoff()
    >>> sorted(pe.encode('Christopher'))
    ['494379', '594379']
    >>> pe.encode('Niall')
    {'680000'}
    >>> pe.encode('Smith')
    {'463000'}
    >>> pe.encode('Schmidt')
    {'463000'}

    >>> sorted(DaitchMokotoff(max_length=20,
    ... zero_pad=False).encode('The quick brown fox'))
    ['35457976754', '3557976754']


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    dms = ['']  # initialize empty code list

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in self._uc_set)

    # Nothing to convert, return base case
    if not word:
        if self._zero_pad:
            return {'0' * self._max_length}
        return {'0'}

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible
        # substrings for which codes exist in the Daitch-Mokotoff coding
        for sstr in self._dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the
                # code
                dm_val = self._dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (
                    pos + len(sstr) < len(word)
                    and word[pos + len(sstr)] in self._uc_v_set
                ):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; an ambiguous (tuple) value forks
                # every candidate code into two branches.
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] + [
                        _ + text_type(dm_val[1]) for _ in dms
                    ]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (
        ''.join(c for c in self._delete_consecutive_repeats(_) if c != '_')
        for _ in dms
    )

    # Trim codes and return set
    if self._zero_pad:
        dms = (
            (_ + ('0' * self._max_length))[: self._max_length] for _ in dms
        )
    else:
        dms = (_[: self._max_length] for _ in dms)

    return set(dms)
def encode(self, word):
    """Return the SfinxBis code for a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    tuple
        The SfinxBis value

    Examples
    --------
    >>> pe = SfinxBis()
    >>> pe.encode('Christopher')
    ('K68376',)
    >>> pe.encode('Niall')
    ('N4',)
    >>> pe.encode('Smith')
    ('S53',)
    >>> pe.encode('Schmidt')
    ('S53',)

    >>> pe.encode('Johansson')
    ('J585',)
    >>> pe.encode('Sjöberg')
    ('#162',)


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform

        Returns
        -------
        str
            Transformed word

        .. versionadded:: 0.1.0

        """
        # Re-spell foreign digraphs/trigraphs as Swedish equivalents.
        lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
        lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
        lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
        lokal_ordet = lokal_ordet.replace('SCH', 'SH')
        lokal_ordet = lokal_ordet.replace('QU', 'KV')
        lokal_ordet = lokal_ordet.replace('IO', 'JO')
        lokal_ordet = lokal_ordet.replace('PH', 'F')

        # After a vowel, Ü/Y/I reduce to J.
        for i in self._harde_vokaler:
            lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
            lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
            lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
        for i in self._mjuka_vokaler:
            lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
            lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
            lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')

        # Drop H before any consonant.
        if 'H' in lokal_ordet:
            for i in self._uc_c_set:
                lokal_ordet = lokal_ordet.replace('H' + i, i)

        # Single-character substitutions, then special letters.
        lokal_ordet = lokal_ordet.translate(self._substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform

        Returns
        -------
        str
            Transformed word

        .. versionadded:: 0.1.0

        """
        # Initial-sound coding: '$' marks a vowel onset, '#' the Swedish
        # sje/tje sounds; other rules normalize the leading consonant.
        if (
            lokal_ordet[0:1] in self._mjuka_vokaler
            or lokal_ordet[0:1] in self._harde_vokaler
        ):
            lokal_ordet = '$' + lokal_ordet[1:]
        elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            lokal_ordet = 'J' + lokal_ordet[2:]
        elif (
            lokal_ordet[0:1] == 'G'
            and lokal_ordet[1:2] in self._mjuka_vokaler
        ):
            lokal_ordet = 'J' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'Q':
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset(
            self._mjuka_vokaler | self._harde_vokaler
        ):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif (
            lokal_ordet[0:1] == 'C'
            and lokal_ordet[1:2] in self._harde_vokaler
        ):
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif (
            lokal_ordet[0:1] == 'C'
            and lokal_ordet[1:2] in self._uc_c_set
        ):
            lokal_ordet = 'K' + lokal_ordet[1:]
        elif lokal_ordet[0:1] == 'X':
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif (
            lokal_ordet[0:1] == 'C'
            and lokal_ordet[1:2] in self._mjuka_vokaler
        ):
            lokal_ordet = 'S' + lokal_ordet[1:]
        elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            lokal_ordet = '#' + lokal_ordet[3:]
        elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif (
            lokal_ordet[0:2] == 'SK'
            and lokal_ordet[2:3] in self._mjuka_vokaler
        ):
            lokal_ordet = '#' + lokal_ordet[2:]
        elif (
            lokal_ordet[0:1] == 'K'
            and lokal_ordet[1:2] in self._mjuka_vokaler
        ):
            lokal_ordet = '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: uppercase ("Steg 1, Versaler")
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove noble prefixes ("Steg 2, Ta bort adelsprefix")
    for adelstitel in self._adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel) - 1 :]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: remove doubled letters at the start of the name
    # ("Steg 3, Ta bort dubbelteckning i början på namnet")
    ordlista = [
        self._delete_consecutive_repeats(ordet) for ordet in ordlista
    ]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedishization ("Steg 4, Försvenskning")
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove all characters that are not A-Ö
    # ("Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)")
    ordlista = [
        ''.join(c for c in ordet if c in self._uc_set)
        for ordet in ordlista
    ]

    # Step 6: code the first sound ("Steg 6, Koda första ljudet")
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split the name into two parts
    # ("Steg 7, Dela upp namnet i två delar")
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: apply phonetic transformation to the remainder
    # ("Steg 8, Utför fonetisk transformation i resten")
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder into digits
    # ("Steg 9, Koda resten till en sifferkod")
    for vokal in self._mjuka_vokaler:
        rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
    rest = [ordet.translate(self._trans) for ordet in rest]

    # Step 10: remove adjacent duplicates
    # ("Steg 10, Ta bort intilliggande dubbletter")
    rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove all "9"s ("Steg 11, Ta bort alla '9'")
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: rejoin the parts ("Steg 12, Sätt ihop delarna igen")
    ordlista = [
        ''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
    ]

    # truncate, if max_length is set
    if self._max_length > 0:
        ordlista = [ordet[: self._max_length] for ordet in ordlista]

    return tuple(ordlista)
def encode(self, word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The SPFC value

    Raises
    ------
    AttributeError
        Word attribute must be a string with a space or period dividing the
        first and last names or a tuple/list consisting of the first and
        last names

    Examples
    --------
    >>> pe = SPFC()
    >>> pe.encode('Christopher Smith')
    '01160'
    >>> pe.encode('Christopher Schmidt')
    '01160'
    >>> pe.encode('Niall Smith')
    '01660'
    >>> pe.encode('Niall Schmidt')
    '01660'

    >>> pe.encode('L.Smith')
    '01960'
    >>> pe.encode('R.Miller')
    '65490'

    >>> pe.encode(('L', 'Smith'))
    '01960'
    >>> pe.encode(('R', 'Miller'))
    '65490'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """

    def _raise_word_ex():
        """Raise an AttributeError.

        Raises
        ------
        AttributeError
            Word attribute must be a string with a space or period
            dividing the first and last names or a tuple/list consisting
            of the first and last names

        .. versionadded:: 0.1.0

        """
        raise AttributeError(
            'Word attribute must be a string with a space or period '
            + 'dividing the first and last names or a tuple/list '
            + 'consisting of the first and last names'
        )

    if not word:
        return ''

    names = []

    # Accept 'First Last', 'F.Last', or a two-element iterable.
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _raise_word_ex()
        names = word
    else:
        _raise_word_ex()

    names = [
        unicode_normalize(
            'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
        )
        for _ in names
    ]
    code = ''

    def _steps_one_to_three(name):
        """Perform the first three steps of SPFC.

        Parameters
        ----------
        name : str
            Name to transform

        Returns
        -------
        str
            Transformed name

        .. versionadded:: 0.1.0

        """
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in self._uc_set)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for subst in self._substitutions:
            name = name.replace(subst[0], subst[1])

        # 2. In the name field, replace multiple letters with a single
        # letter
        name = self._delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the
        # name field.
        if name:
            name = name[0] + ''.join(
                _
                for _ in name[1:]
                if _ not in {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}
            )
        return name

    names = [_steps_one_to_three(_) for _ in names]

    # 4. The first digit of the code is obtained using PF1 and the first
    # letter of the name field. Remove this letter after coding.
    if names[1]:
        code += names[1][0].translate(self._pf1)
        names[1] = names[1][1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if names[1]:
        if names[1][-3:] in {'DRS', 'STN', 'PRS', 'STR'}:
            code += '7'
            names[1] = names[1][:-3]
        elif names[1][-2:] in {'MN', 'TR', 'SN', 'SR', 'TN', 'TD'}:
            code += '7'
            names[1] = names[1][:-2]
        else:
            code += names[1][-1].translate(self._pf3)
            names[1] = names[1][:-1]

    # 6. The third digit is found using Table PF2 and the first character
    # of the first name. Remove after coding.
    if names[0]:
        code += names[0][0].translate(self._pf2)
        names[0] = names[0][1:]

    # 7. The fourth digit is found using Table PF2 and the first character
    # of the name field. If no letters remain use zero. After coding remove
    # the letter.
    # 8. The fifth digit is found in the same manner as the fourth using
    # the remaining characters of the name field if any.
    for _ in range(2):
        if names[1]:
            code += names[1][0].translate(self._pf2)
            names[1] = names[1][1:]
        else:
            code += '0'

    return code
    def encode(self, word):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The Haase Phonetik value as a numeric string

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to modify
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] follows one of letters

            .. versionadded:: 0.3.0

            """
            if pos > 0 and word[pos - 1] in letters:
                return True
            return False

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to modify
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] precedes one of letters

            .. versionadded:: 0.3.0

            """
            if pos + 1 < len(word) and word[pos + 1] in letters:
                return True
            return False

        # Uppercase, decompose, expand German umlauts/ß, keep only A-Z.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        variants = []
        if self._primary_only:
            variants = [word]
        else:
            # Walk the word, collecting per-position alternative spellings;
            # the Cartesian product of these yields all word variants.
            pos = 0
            if word[:2] == 'CH':
                variants.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            while pos < len(word):
                if word[pos : pos + 4] == 'ILLE':
                    variants.append(('ILLE', 'I'))
                    pos += 4
                elif word[pos : pos + 3] in len_3_vars:
                    variants.append(
                        (word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
                    )
                    pos += 3
                elif word[pos : pos + 2] == 'RB':
                    variants.append(('RB', 'RW'))
                    pos += 2
                elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                    # word-final EAU
                    variants.append(('EAU', 'O'))
                    pos += 3
                elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                    # word-final A or O
                    if word[pos:] == 'O':
                        variants.append(('O', 'OW'))
                    else:
                        variants.append(('A', 'AR'))
                    pos += 1
                else:
                    variants.append((word[pos],))
                    pos += 1

            variants = [''.join(letters) for letters in product(*variants)]

        def _haase_code(word):
            # Kölner-Phonetik-style positional coding, but with vowels
            # mapped to '9' instead of '0'.
            sdx = ''
            for i in range(len(word)):
                if word[i] in self._uc_v_set:
                    sdx += '9'
                elif word[i] == 'B':
                    sdx += '1'
                elif word[i] == 'P':
                    if _before(word, i, {'H'}):
                        sdx += '3'
                    else:
                        sdx += '1'
                elif word[i] in {'D', 'T'}:
                    if _before(word, i, {'C', 'S', 'Z'}):
                        sdx += '8'
                    else:
                        sdx += '2'
                elif word[i] in {'F', 'V', 'W'}:
                    sdx += '3'
                elif word[i] in {'G', 'K', 'Q'}:
                    sdx += '4'
                elif word[i] == 'C':
                    if _after(word, i, {'S', 'Z'}):
                        sdx += '8'
                    elif i == 0:
                        if _before(
                            word,
                            i,
                            {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
                        ):
                            sdx += '4'
                        else:
                            sdx += '8'
                    elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif word[i] == 'X':
                    if _after(word, i, {'C', 'K', 'Q'}):
                        sdx += '8'
                    else:
                        sdx += '48'
                elif word[i] == 'L':
                    sdx += '5'
                elif word[i] in {'M', 'N'}:
                    sdx += '6'
                elif word[i] == 'R':
                    sdx += '7'
                elif word[i] in {'S', 'Z'}:
                    sdx += '8'

            sdx = self._delete_consecutive_repeats(sdx)

            return sdx

        encoded = tuple(_haase_code(word) for word in variants)
        if len(encoded) > 1:
            # De-duplicate while preserving first-seen order.
            encoded_set = set()
            encoded_single = []
            for code in encoded:
                if code not in encoded_set:
                    encoded_set.add(code)
                    encoded_single.append(code)
            return tuple(encoded_single)

        return encoded
def encode(self, word): """Return the Soundex code for a word. Parameters ---------- word : str The word to transform Returns ------- str The Soundex value Examples -------- >>> pe = Soundex() >>> pe.encode("Christopher") 'C623' >>> pe.encode("Niall") 'N400' >>> pe.encode('Smith') 'S530' >>> pe.encode('Schmidt') 'S530' >>> Soundex(max_length=-1).encode('Christopher') 'C623160000000000000000000000000000000000000000000000000000000000' >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher') 'C62316' >>> Soundex(reverse=True).encode('Christopher') 'R132' >>> pe.encode('Ashcroft') 'A261' >>> pe.encode('Asicroft') 'A226' >>> pe_special = Soundex(var='special') >>> pe_special.encode('Ashcroft') 'A226' >>> pe_special.encode('Asicroft') 'A226' .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ # uppercase, normalize, decompose, and filter non-A-Z out word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') if self._var == 'Census': if word[:3] in {'VAN', 'CON'} and len(word) > 4: return ( soundex( word, self._max_length, 'American', self._reverse, self._zero_pad, ), soundex( word[3:], self._max_length, 'American', self._reverse, self._zero_pad, ), ) if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3: return ( soundex( word, self._max_length, 'American', self._reverse, self._zero_pad, ), soundex( word[2:], self._max_length, 'American', self._reverse, self._zero_pad, ), ) # Otherwise, proceed as usual (var='American' mode, ostensibly) word = ''.join(c for c in word if c in self._uc_set) # Nothing to convert, return base case if not word: if self._zero_pad: return '0' * self._max_length return '0' # Reverse word if computing Reverse Soundex if self._reverse: word = word[::-1] # apply the Soundex algorithm sdx = word.translate(self._trans) if self._var == 'special': sdx = sdx.replace('9', '0') # special rule for 1880-1910 census else: sdx = sdx.replace('9', '') # rule 1 sdx = 
self._delete_consecutive_repeats(sdx) # rule 3 if word[0] in 'HW': sdx = word[0] + sdx else: sdx = word[0] + sdx[1:] sdx = sdx.replace('0', '') # rule 1 if self._zero_pad: sdx += '0' * self._max_length # rule 4 return sdx[: self._max_length]
    def encode(self, word):
        """Return the Spanish Metaphone of a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Spanish Metaphone code

        Examples
        --------
        >>> pe = SpanishMetaphone()
        >>> pe.encode('Perez')
        'PRZ'
        >>> pe.encode('Martinez')
        'MRTNZ'
        >>> pe.encode('Gutierrez')
        'GTRRZ'
        >>> pe.encode('Santiago')
        'SNTG'
        >>> pe.encode('Nicolás')
        'NKLS'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _is_vowel(pos):
            """Return True if the character at word[pos] is a vowel.

            Parameters
            ----------
            pos : int
                Position to check for a vowel

            Returns
            -------
            bool
                True if word[pos] is a vowel

            .. versionadded:: 0.3.0

            """
            return pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}

        # NFC (composed) normalization so accented letters match the
        # single-codepoint replacements below.
        word = unicode_normalize('NFC', text_type(word.upper()))

        meta_key = ''
        pos = 0

        # do some replacements for the modified version
        if self._modified:
            word = word.replace('MB', 'NB')
            word = word.replace('MP', 'NP')
            word = word.replace('BS', 'S')
            if word[:2] == 'PS':
                word = word[1:]

        # simple replacements
        word = word.replace('Á', 'A')
        word = word.replace('CH', 'X')
        word = word.replace('Ç', 'S')
        word = word.replace('É', 'E')
        word = word.replace('Í', 'I')
        word = word.replace('Ó', 'O')
        word = word.replace('Ú', 'U')
        word = word.replace('Ñ', 'NY')
        word = word.replace('GÜ', 'W')
        word = word.replace('Ü', 'U')
        word = word.replace('B', 'V')
        word = word.replace('LL', 'Y')

        while len(meta_key) < self._max_length:
            if pos >= len(word):
                break

            # get the next character
            current_char = word[pos]

            # if a vowel in pos 0, add to key
            if _is_vowel(pos) and pos == 0:
                meta_key += current_char
                pos += 1
            # otherwise, do consonant rules
            else:
                # simple consonants (unmutated)
                if current_char in {
                    'D',
                    'F',
                    'J',
                    'K',
                    'M',
                    'N',
                    'P',
                    'T',
                    'V',
                    'L',
                    'Y',
                }:
                    meta_key += current_char
                    # skip doubled consonants
                    if word[pos + 1 : pos + 2] == current_char:
                        pos += 2
                    else:
                        pos += 1
                else:
                    if current_char == 'C':
                        # special case 'acción', 'reacción',etc.
                        if word[pos + 1 : pos + 2] == 'C':
                            meta_key += 'X'
                            pos += 2
                        # special case 'cesar', 'cien', 'cid', 'conciencia'
                        elif word[pos + 1 : pos + 2] in {'E', 'I'}:
                            meta_key += 'Z'
                            pos += 2
                        # base case
                        else:
                            meta_key += 'K'
                            pos += 1
                    elif current_char == 'G':
                        # special case 'gente', 'ecologia',etc
                        if word[pos + 1 : pos + 2] in {'E', 'I'}:
                            meta_key += 'J'
                            pos += 2
                        # base case
                        else:
                            meta_key += 'G'
                            pos += 1
                    elif current_char == 'H':
                        # since the letter 'H' is silent in Spanish,
                        # set the meta key to the vowel after the letter 'H'
                        if _is_vowel(pos + 1):
                            meta_key += word[pos + 1]
                            pos += 2
                        else:
                            meta_key += 'H'
                            pos += 1
                    elif current_char == 'Q':
                        # QU reduces to K; the U is consumed silently
                        if word[pos + 1 : pos + 2] == 'U':
                            pos += 2
                        else:
                            pos += 1
                        meta_key += 'K'
                    elif current_char == 'W':
                        meta_key += 'U'
                        pos += 1
                    elif current_char == 'R':
                        meta_key += 'R'
                        pos += 1
                    elif current_char == 'S':
                        # word-initial S before a consonant gains a leading E
                        if not _is_vowel(pos + 1) and pos == 0:
                            meta_key += 'ES'
                            pos += 1
                        else:
                            meta_key += 'S'
                            pos += 1
                    elif current_char == 'Z':
                        meta_key += 'Z'
                        pos += 1
                    elif current_char == 'X':
                        # word-initial X before a consonant gains a leading E
                        if (
                            len(word) > 1
                            and pos == 0
                            and not _is_vowel(pos + 1)
                        ):
                            meta_key += 'EX'
                            pos += 1
                        else:
                            meta_key += 'X'
                            pos += 1
                    else:
                        # any other character contributes nothing
                        pos += 1

        # Final change from S to Z in modified version
        if self._modified:
            meta_key = meta_key.replace('S', 'Z')

        return meta_key
    def run(self, text, options, path):
        """
        Search the dictionary, walk the returned articles, then
        download articles that look like a match, and find MP3s in
        those articles that match the original input.

        Parameters: `text` is the (ASCII-only) input phrase; `options`
        must have `voice` == 'de' (only German is supported); `path` is
        where the matching MP3, if found, is saved via `net_download`.

        Raises IOError on overlong/non-ASCII input, on an unrecognized
        search term, or when no matching recorded audio is found.
        """

        assert options['voice'] == 'de', "Only German is supported."

        if len(text) > INPUT_MAXIMUM:
            raise IOError("Your input text is too long for Duden.")

        # Duden article segments/guides are matched against ASCII text, so
        # reject any input that cannot be encoded as ASCII up front.
        try:
            text.encode('us-ascii')
        except UnicodeEncodeError:
            raise IOError("Your input text uses characters that cannot be "
                          "accurately searched for in the Duden.")

        # 'sz' in the ASCII input stands in for the German eszett (ß).
        text_search = text.replace('sz', '\u00df')
        self._logger.debug('Duden: Searching on "%s"', text_search)

        try:
            search_html = self.net_stream((SEARCH_FORM, dict(s=text_search)),
                                          require=dict(mime='text/html')).decode()
        except IOError as io_error:
            # presumably net_stream attaches an HTTP status as `code` —
            # TODO confirm; a 404 means Duden had no results at all.
            if getattr(io_error, 'code', None) == 404:
                raise IOError("Duden does not recognize this input.")
            else:
                raise

        # Precompute the normalized forms of the input used for matching
        # article URL segments and MP3 pronunciation guides.
        text_lower = text.lower()
        text_lower_underscored_trailing = text_lower. \
            replace(' ', '_').replace('-', '_') + '_'
        text_compressed = text.replace(' ', '').replace('-', '')
        text_lower_compressed = text_compressed.lower()
        text_deumlauted_compressed = text_compressed.replace('ae', 'a'). \
            replace('oe', 'o').replace('ue', 'u')

        self._logger.debug('Got a search response; will follow links whose '
                           'lowercased+compressed article segment equals "%s" '
                           'or whose lowercased-but-still-underscored article '
                           'segment begins with "%s"; looking for MP3s whose '
                           'compressed guide says "%s" or "%s"',
                           text_lower_compressed,
                           text_lower_underscored_trailing,
                           text_compressed, text_deumlauted_compressed)

        # Track already-visited article URLs so each is fetched only once.
        seen_article_urls = {}

        for article_match in RE_DETAIL.finditer(search_html):
            article_url = article_match.group(1)
            if article_url in seen_article_urls:
                continue
            seen_article_urls[article_url] = True

            segment = article_match.group(2)
            segment_lower = segment.lower()
            segment_lower_compressed = segment_lower.replace('_', '')

            # Follow the article if its URL segment matches the input either
            # exactly (compressed) or as an underscore-delimited prefix.
            if segment_lower_compressed == text_lower_compressed:
                self._logger.debug('Duden: lowered+compressed article segment '
                                   'for %s are same ("%s")',
                                   article_url, segment_lower_compressed)
            elif segment_lower.startswith(text_lower_underscored_trailing):
                self._logger.debug('Duden: lowered segment "%s" for %s begins '
                                   'with "%s"', segment_lower, article_url,
                                   text_lower_underscored_trailing)
            else:
                self._logger.debug('Duden: article segment for %s does not '
                                   'match; skipping', article_url)
                continue

            article_html = self.net_stream(article_url).decode()

            for mp3_match in RE_MP3.finditer(article_html):
                # Extract the pronunciation guide text, strip markup via
                # BeautifulSoup, unescape HTML entities, then normalize to
                # a bare ASCII form for comparison with the input.
                guide = mp3_match.group(3)
                guide = ''.join(HTML_PARSER.unescape(node)
                                for node
                                in BeautifulSoup(guide,
                                                 'html.parser').findAll(text=True))
                guide_normalized = unicode_normalize(
                    'NFKD',
                    self.modify(guide).replace('-', '').replace(' ', ''),
                ).encode('ASCII', 'ignore').decode()
                mp3_url = mp3_match.group(5)

                if guide_normalized == text_compressed or \
                        guide_normalized == text_deumlauted_compressed:
                    self._logger.debug('Duden: found MATCHING MP3 at %s for '
                                       '"%s", which normalized to "%s" and '
                                       'matches our input',
                                       mp3_url, guide, guide_normalized)
                    self.net_download(path, mp3_url,
                                      require=dict(mime='audio/mpeg'))
                    # First matching MP3 wins; download and stop.
                    return
                else:
                    self._logger.debug('Duden: found non-matching MP3 at %s '
                                       'for "%s", which normalized to "%s" '
                                       'and does not match our input',
                                       mp3_url, guide, guide_normalized)

        raise IOError("Duden does not have recorded audio for this word.")
def encode(self, word): """Return the Fuzzy Soundex code for a word. Parameters ---------- word : str The word to transform Returns ------- str The Fuzzy Soundex value Examples -------- >>> pe = FuzzySoundex() >>> pe.encode('Christopher') 'K6931' >>> pe.encode('Niall') 'N4000' >>> pe.encode('Smith') 'S5300' >>> pe.encode('Smith') 'S5300' .. versionadded:: 0.1.0 .. versionchanged:: 0.3.6 Encapsulated in class """ word = unicode_normalize('NFKD', text_type(word.upper())) word = word.replace('ß', 'SS') if not word: if self._zero_pad: return '0' * self._max_length return '0' if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}: word = 'SS' + word[2:] elif word[:2] == 'GN': word = 'NN' + word[2:] elif word[:2] in {'HR', 'WR'}: word = 'RR' + word[2:] elif word[:2] == 'HW': word = 'WW' + word[2:] elif word[:2] in {'KN', 'NG'}: word = 'NN' + word[2:] if word[-2:] == 'CH': word = word[:-2] + 'KK' elif word[-2:] == 'NT': word = word[:-2] + 'TT' elif word[-2:] == 'RT': word = word[:-2] + 'RR' elif word[-3:] == 'RDT': word = word[:-3] + 'RR' word = word.replace('CA', 'KA') word = word.replace('CC', 'KK') word = word.replace('CK', 'KK') word = word.replace('CE', 'SE') word = word.replace('CHL', 'KL') word = word.replace('CL', 'KL') word = word.replace('CHR', 'KR') word = word.replace('CR', 'KR') word = word.replace('CI', 'SI') word = word.replace('CO', 'KO') word = word.replace('CU', 'KU') word = word.replace('CY', 'SY') word = word.replace('DG', 'GG') word = word.replace('GH', 'HH') word = word.replace('MAC', 'MK') word = word.replace('MC', 'MK') word = word.replace('NST', 'NSS') word = word.replace('PF', 'FF') word = word.replace('PH', 'FF') word = word.replace('SCH', 'SSS') word = word.replace('TIO', 'SIO') word = word.replace('TIA', 'SIO') word = word.replace('TCH', 'CHH') sdx = word.translate(self._trans) sdx = sdx.replace('-', '') # remove repeating characters sdx = self._delete_consecutive_repeats(sdx) if word[0] in {'H', 'W', 'Y'}: sdx = word[0] + sdx else: sdx = word[0] + sdx[1:] 
sdx = sdx.replace('0', '') if self._zero_pad: sdx += '0' * self._max_length return sdx[: self._max_length]
    def encode(self, word):
        """Calculate the early version of the Henry code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The early Henry code

        Examples
        --------
        >>> pe = HenryEarly()
        >>> pe.encode('Marchand')
        'MRC'
        >>> pe.encode('Beaulieu')
        'BL'
        >>> pe.encode('Beaumont')
        'BM'
        >>> pe.encode('Legrand')
        'LGR'
        >>> pe.encode('Pelletier')
        'PLT'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase, decompose, and keep only A-Z.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        if not word:
            return ''

        # Rule Ia seems to be covered entirely in II

        # Rule Ib
        if word[0] in self._uc_vy_set:
            # Ib1
            if (
                word[1:2] in self._uc_c_set - {'M', 'N'}
                and word[2:3] in self._uc_c_set
            ) or (
                word[1:2] in self._uc_c_set
                and word[2:3] not in self._uc_c_set
            ):
                if word[0] == 'Y':
                    word = 'I' + word[1:]
            # Ib2
            elif word[1:2] in {'M', 'N'} and word[2:3] in self._uc_c_set:
                if word[0] == 'E':
                    word = 'A' + word[1:]
                elif word[0] in {'I', 'U', 'Y'}:
                    word = 'E' + word[1:]
            # Ib3
            elif word[:2] in self._diph:
                word = self._diph[word[:2]] + word[2:]
            # Ib4
            elif word[1:2] in self._uc_vy_set and word[0] == 'Y':
                word = 'I' + word[1:]

        code = ''
        # `skip` counts characters already consumed by a multi-letter rule.
        skip = 0

        # Rule II
        for pos, char in enumerate(word):
            nxch = word[pos + 1 : pos + 2]
            prev = word[pos - 1 : pos]

            if skip:
                skip -= 1
            elif char in self._uc_vy_set:
                code += char
            # IIc
            elif char == nxch:
                skip = 1
                code += char
            elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
                continue
            # IIb
            elif char in self._simple:
                code += self._simple[char]
            elif char in {'C', 'G', 'P', 'Q', 'S'}:
                if char == 'C':
                    if nxch in {'A', 'O', 'U', 'L', 'R'}:
                        code += 'K'
                    elif nxch in {'E', 'I', 'Y'}:
                        code += 'S'
                    elif nxch == 'H':
                        if word[pos + 2 : pos + 3] in self._uc_vy_set:
                            code += 'C'
                        else:  # CHR, CHL, etc.
                            code += 'K'
                    else:
                        code += 'C'
                elif char == 'G':
                    if nxch in {'A', 'O', 'U', 'L', 'R'}:
                        code += 'G'
                    elif nxch in {'E', 'I', 'Y'}:
                        code += 'J'
                    elif nxch == 'N':
                        code += 'N'
                elif char == 'P':
                    if nxch != 'H':
                        code += 'P'
                    else:
                        code += 'F'
                elif char == 'Q':
                    if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
                        code += 'G'
                    else:  # QUA, QUO, etc.
                        code += 'K'
                else:  # S...
                    if word[pos : pos + 6] == 'SAINTE':
                        code += 'X'
                        skip = 5
                    elif word[pos : pos + 5] == 'SAINT':
                        code += 'X'
                        skip = 4
                    elif word[pos : pos + 3] == 'STE':
                        code += 'X'
                        skip = 2
                    elif word[pos : pos + 2] == 'ST':
                        code += 'X'
                        skip = 1
                    elif nxch in self._uc_c_set:
                        continue
                    else:
                        code += 'S'
            # IId
            elif char == 'H' and prev in self._uc_c_set:
                continue
            elif char in self._uc_c_set - {
                'L',
                'R',
            } and nxch in self._uc_c_set - {'L', 'R'}:
                continue
            elif char == 'L' and nxch in {'M', 'N'}:
                continue
            elif (
                char in {'M', 'N'}
                and prev in self._uc_vy_set
                and nxch in self._uc_c_set
            ):
                continue
            # IIa
            else:
                code += char

        # IIe1
        if code[-4:] in {'AULT', 'EULT', 'OULT'}:
            code = code[:-2]
        # The following are blocked by rules above
        # elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        #     code = code[:-3]
        # elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
        #                                             'NS', 'NT'}:
        #     code = code[:-2]
        elif code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
            code = code[:-1]
        # IIe2
        elif code[-2:-1] in self._uc_vy_set and code[-1:] in {
            'D',
            'M',
            'N',
            'S',
            'T',
        }:
            code = code[:-1]
        elif code[-2:] == 'ER':
            code = code[:-1]

        # Drop non-initial vowels (65/69/73/79/85/89 are the code points
        # of A, E, I, O, U, Y)
        code = code[:1] + code[1:].translate(
            {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
        )

        if self._max_length != -1:
            code = code[: self._max_length]

        return code
    def encode(self, word):
        """Return the Kölner Phonetik (numeric output) code for a word.

        While the output code is numeric, it is still a str because 0s can
        lead the code.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Kölner Phonetik value as a numeric string

        Example
        -------
        >>> pe = Koelner()
        >>> pe.encode('Christopher')
        '478237'
        >>> pe.encode('Niall')
        '65'
        >>> pe.encode('Smith')
        '862'
        >>> pe.encode('Schmidt')
        '862'
        >>> pe.encode('Müller')
        '657'
        >>> pe.encode('Zimmermann')
        '86766'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Parameters
            ----------
            word : str
                The word to check
            pos : int
                Position within word to check
            letters : str
                Letters to confirm precede word[pos]

            Returns
            -------
            bool
                True if word[pos] follows a value in letters

            .. versionadded:: 0.1.0

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Parameters
            ----------
            word : str
                The word to check
            pos : int
                Position within word to check
            letters : str
                Letters to confirm follow word[pos]

            Returns
            -------
            bool
                True if word[pos] precedes a value in letters

            .. versionadded:: 0.1.0

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        sdx = ''

        # Uppercase, decompose, expand German umlauts/ß, keep only A-Z.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            return sdx

        # Code each letter by its phonetic class; several letters depend on
        # their immediate neighbors (notably C, P, D/T, X).
        for i in range(len(word)):
            if word[i] in self._uc_v_set:
                sdx += '0'
            elif word[i] == 'B':
                sdx += '1'
            elif word[i] == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif word[i] in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif word[i] in {'F', 'V', 'W'}:
                sdx += '3'
            elif word[i] in {'G', 'K', 'Q'}:
                sdx += '4'
            elif word[i] == 'C':
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    if _before(
                        word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                    ):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif word[i] == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif word[i] == 'L':
                sdx += '5'
            elif word[i] in {'M', 'N'}:
                sdx += '6'
            elif word[i] == 'R':
                sdx += '7'
            elif word[i] in {'S', 'Z'}:
                sdx += '8'

        sdx = self._delete_consecutive_repeats(sdx)

        # Drop all zeros except a leading one.
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx