def normalise_token(token):
    """
    Return the token with tie bars (e.g. t͡ʃ → tʃ) and the diacritic marking
    non-syllabic vowels (e.g. aɪ̯ → aɪ) stripped out.

    Tokens that differ only in these symbols thereby end up in a single
    (arbitrarily chosen) "normal" form.
    """
    # The combining mark alone, taken off its dotted-circle carrier.
    non_syllabic = '◌̯'[1]

    kept = []
    for symbol in token:
        if is_tie_bar(symbol) or symbol == non_syllabic:
            continue
        kept.append(symbol)

    return ''.join(kept)
def convert_ipa_token(token):
    """
    Turn an IPA token into its ASJP counterpart.

    Raise (Assertion|Index)Error if the input does not constitute a valid
    IPA token.

    Helper for ipa2asjp(ipa_seq).
    """
    pieces = []
    after_tie_bar = False
    # The combining mark alone, taken off its base letter.
    dental_mark = 'n̪'[1]

    for symbol in token:
        if is_letter(symbol):
            if not after_tie_bar:
                # chart.ipa may map one letter to several ASJP chars.
                pieces.extend(chart.ipa[symbol])
                continue

            # A letter right after a tie bar is fused with the previous
            # output item when chart.ipa knows the combination; otherwise
            # the letter is left out of the output.
            combined = pieces[-1] + symbol
            if combined in chart.ipa:
                pieces[-1] = chart.ipa[combined]
            after_tie_bar = False

        elif is_tie_bar(symbol):
            after_tie_bar = True

        elif symbol == dental_mark and pieces[-1] == 'n':
            pieces[-1] = chart.ipa['n̪']

        elif symbol in chart.ipa:
            mapped = chart.ipa[symbol]
            if mapped in chart.asjp_diacritics:
                # ASJP diacritics attach to the preceding output item.
                pieces[-1] += mapped
            else:
                pieces.append(mapped)

    assert 1 <= len(pieces) <= 3

    # Multi-item outputs get a closing marker: '~' for two items,
    # '$' otherwise.
    size = len(pieces)
    if size != 1:
        pieces.append('~' if size == 2 else '$')

    return ''.join(pieces)
def sanitise_token(token, keep_digits=False):
    """
    Sanitise a string in three steps: (1) bring its chars into the normal
    form that complies with the IPA spec; (2) swap common substitutes for
    their IPA equivalents; (3) drop every char that is not an IPA letter,
    diacritic, tie bar, length marker, tone symbol, or one of the
    superscript digits ¹²³⁴⁵.

    Unless keep_digits is set to True, digits are first replaced with Chao
    letters.

    This function leverages ipatok functions that are not in the package's
    public API.
    """
    if not keep_digits:
        token = replace_digits_with_chao(token)

    token = normalise(token)
    token = replace_substitutes(token)

    def is_allowed(char):
        # The checks run in this order and stop at the first match.
        return (
            is_letter(char, strict=False)
            or is_tie_bar(char)
            or is_diacritic(char, strict=False)
            or is_length(char)
            or is_tone(char, strict=False)
            or char in '¹²³⁴⁵'
        )

    return ''.join(filter(is_allowed, token))
def tokenise_word(string,
                  strict=False, replace=False, tones=False, unknown=False):
    """
    Tokenise the string into a list of tokens or raise ValueError if it
    cannot be tokenised (relatively) unambiguously. The string should not
    include whitespace, i.e. it is assumed to be a single word.

    If strict=False, allow non-standard letters and diacritics, as well as
    initial diacritic-only tokens (e.g. pre-aspiration). If replace=True,
    replace some common non-IPA symbols with their IPA counterparts. If
    tones=False, ignore tone symbols. If unknown=False, ignore symbols that
    cannot be classified into a relevant category.

    Helper for tokenise(string, ..).
    """
    string = normalise(string)

    if replace:
        string = ipa.replace_substitutes(string)

    tokens = []

    for index, char in enumerate(string):
        if ipa.is_letter(char, strict):
            # A letter directly preceded by a tie bar belongs to the token
            # the tie bar was attached to. The `tokens` guard also keeps
            # string[index-1] from wrapping around at index 0.
            if tokens and ipa.is_tie_bar(string[index-1]):
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_tie_bar(char):
            if not tokens:
                raise ValueError(f'The string starts with a tie bar: {string}')
            tokens[-1] += char

        elif ipa.is_diacritic(char, strict) or ipa.is_length(char):
            if tokens:
                tokens[-1] += char
            elif strict:
                raise ValueError(
                    f'The string starts with a diacritic: {string}'
                )
            else:
                # Non-strict mode lets a word-initial diacritic start its
                # own token.
                tokens.append(char)

        elif tones and ipa.is_tone(char, strict):
            if unicodedata.combining(char):
                # Combining tone marks need a token to attach to.
                if not tokens:
                    raise ValueError(
                        f'The string starts with an accent mark: {string}'
                    )
                tokens[-1] += char
            elif tokens and ipa.is_tone(tokens[-1][-1], strict):
                # Consecutive tone symbols are grouped into one token.
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_suprasegmental(char, strict):
            # Suprasegmentals are dropped.
            pass

        elif strict:
            # unicodedata.name() raises its own ValueError ('no such name')
            # for unnamed code points such as control chars; the default
            # keeps the intended message in that case.
            char_name = unicodedata.name(char, 'unnamed character')
            raise ValueError(f'Unrecognised char: {char} ({char_name})')

        elif unknown:
            tokens.append(char)

    return tokens
def test_is_tie_bar(self):
    """
    is_tie_bar should return True for IPA tie bars and False for every
    other IPA symbol.
    """
    # Hand-picked cases: the two combining tie bars, then two symbols that
    # are not tie bars.
    for tie_bar in ('◌͡'[1], '◌͜'[1]):
        self.assertTrue(is_tie_bar(tie_bar))

    for other in ('ʋ', '‿'):
        self.assertFalse(is_tie_bar(other))

    # Exhaustive checks against the chart.
    for tie_bar in chart.tie_bars:
        self.assertTrue(is_tie_bar(tie_bar))

    other_groups = (
        'consonants', 'vowels', 'diacritics',
        'suprasegmentals', 'lengths', 'tones',
    )
    for group_name in other_groups:
        for symbol in getattr(chart, group_name):
            self.assertFalse(is_tie_bar(symbol))