def test_is_letter_non_ipa(self):
    """
    is_letter should return False for non-IPA letters in strict mode
    and True in non-strict mode.
    """
    for char in ['ʣ', 'ɫ', 'g', 'Γ', 'F', 'Ǉ']:
        self.assertFalse(is_letter(char, strict=True))
        self.assertTrue(is_letter(char, strict=False))
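# The same behaviour outside of a test, assuming is_letter is the
# ipatok.ipa function exercised by the unit test above:
#
# >>> from ipatok.ipa import is_letter
# >>> is_letter('ɫ', strict=True)   # not an official IPA letter
# False
# >>> is_letter('ɫ', strict=False)  # but still a Unicode letter
# True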
import warnings


def get_vector(token):
    """
    Return the vector representation (an entry from VECTORS) of an IPA
    token. Raise an exception if VECTORS is not yet set.
    """
    token = normalise_token(token)

    try:
        return VECTORS[token]
    except KeyError:
        pass

    letters = ''.join([char for char in token if is_letter(char, False)])

    if len(letters) > 1:
        sub_tokens = []

        for index, sub_token in enumerate(tokenise(token)):
            if sub_token in VECTORS:
                sub_tokens.append(sub_token)
            elif letters[index] in VECTORS:
                # fall back to the sub-token's bare letter, which,
                # unlike the sub-token itself, has a vector
                sub_tokens.append(letters[index])
            else:
                break
        else:  # no break, i.e. all sub-tokens are recognised
            # warnings.warn('neural-net: {} → {}'.format(
            #     token, ' '.join(sub_tokens)))
            sub_vectors = [VECTORS[sub_token] for sub_token in sub_tokens]
            return sum(sub_vectors) / len(sub_vectors)

    try:
        return VECTORS[letters]
    except KeyError:
        warnings.warn('neural-net: cannot recognise {}'.format(token))
        raise
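# A minimal sketch of the fallback above, assuming VECTORS is the
# module-level dict of numpy arrays keyed by IPA token (these entries
# are made up for illustration):
#
# VECTORS = {'t': np.array([1.0, 0.0]), 's': np.array([0.0, 1.0])}
#
# >>> get_vector('t')   # direct hit
# array([1., 0.])
# >>> get_vector('ts')  # tokenised into ['t', 's'] and averaged
# array([0.5, 0.5])
# >>> get_vector('q')   # unrecognised: warns and re-raises the KeyError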
def get_vector_key(token):
    """
    Return the key that maps to the vector representation of a phoneme
    (i.e. IPA token). Raise an exception if the module-level model is
    not set.
    """
    token = normalise_token(token)

    if token in model.wv:
        return token

    if token == '':
        return '\0'

    alt_token = ''.join([char for char in token if is_letter(char, False)])

    if alt_token in model.wv:
        return alt_token

    warnings.warn('phon2vec: cannot recognise {}'.format(token))
    return '\0'
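# Sketch of the lookup order above, assuming `model` is a module-level
# gensim word2vec model, so that `model.wv` supports membership tests;
# the tokens below are illustrative:
#
# >>> get_vector_key('t')   # returned as-is if the model saw it in training
# 't'
# >>> get_vector_key('tʲ')  # falls back to the bare letters
# 't'
# >>> get_vector_key('')    # the empty token maps to the reserved key
# '\x00'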
def convert_ipa_token(token):
    """
    Convert an IPA token into an ASJP token or raise (Assertion|Index)Error
    if the input does not constitute a valid IPA token.

    Helper for ipa2asjp(ipa_seq).
    """
    output = []
    has_tie_bar = False

    for char in token:
        if is_letter(char):
            if has_tie_bar:
                # the previous letter and this one form an affricate
                affricate = output[-1] + char
                if affricate in chart.ipa:
                    output[-1] = chart.ipa[affricate]
                has_tie_bar = False
            else:
                for asjp_char in chart.ipa[char]:
                    output.append(asjp_char)

        elif is_tie_bar(char):
            has_tie_bar = True

        elif char == 'n̪'[1] and output[-1] == 'n':
            # the combining dental diacritic turns n into the dental nasal
            output[-1] = chart.ipa['n̪']

        elif char in chart.ipa:
            asjp_char = chart.ipa[char]
            if asjp_char in chart.asjp_diacritics:
                output[-1] += asjp_char
            else:
                output.append(asjp_char)

    assert 1 <= len(output) <= 3

    # ASJP marks two symbols forming a single sound with ~ and three with $
    if len(output) != 1:
        output.append('~' if len(output) == 2 else '$')

    return ''.join(output)
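# Illustrative calls, assuming the chart maps the usual ASJPcode symbols
# (e.g. ŋ → N, t͡s → c); the exact mappings live in the module's chart:
#
# >>> convert_ipa_token('ŋ')   # one ASJP symbol
# 'N'
# >>> convert_ipa_token('t͡s')  # the tie bar merges the two letters
# 'c'
# >>> convert_ipa_token('mb')  # two symbols, marked as one sound
# 'mb~'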
def sanitise_token(token, keep_digits=False):
    """
    Sanitise a string by (1) ensuring its chars' normal forms comply with
    the IPA spec; (2) replacing common substitutes with their IPA
    equivalents; (3) excluding chars that are not IPA letters, diacritics,
    tie bars, length markers, tone symbols, or superscript tone numbers.

    If keep_digits is set to True, do not replace digits with Chao
    letters.

    This function leverages ipatok functions that are not part of the
    package's public API.
    """
    if not keep_digits:
        token = replace_digits_with_chao(token)

    token = replace_substitutes(normalise(token))

    return ''.join([
        char for char in token
        if is_letter(char, strict=False)
        or is_tie_bar(char)
        or is_diacritic(char, strict=False)
        or is_length(char)
        or is_tone(char, strict=False)
        or char in '¹²³⁴⁵'])
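# An illustrative call; the exact substitute set is ipatok's, so the
# ASCII colon standing in for the length marker is an assumption here:
#
# >>> sanitise_token('t:a%')  # ':' normalised to 'ː'; '%' dropped, as it
# 'tːa'                       # fits none of the kept categories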
import unicodedata

from ipatok import ipa


def tokenise_word(string, strict=False, replace=False, tones=False, unknown=False):
    """
    Tokenise the string into a list of tokens or raise ValueError if it
    cannot be tokenised (relatively) unambiguously. The string should not
    include whitespace, i.e. it is assumed to be a single word.

    If strict=False, allow non-standard letters and diacritics, as well as
    initial diacritic-only tokens (e.g. pre-aspiration).

    If replace=True, replace some common non-IPA symbols with their IPA
    counterparts.

    If tones=False, ignore tone symbols.

    If unknown=False, ignore symbols that cannot be classified into a
    relevant category.

    Helper for tokenise(string, ..).
    """
    string = normalise(string)

    if replace:
        string = ipa.replace_substitutes(string)

    tokens = []

    for index, char in enumerate(string):
        if ipa.is_letter(char, strict):
            if tokens and ipa.is_tie_bar(string[index-1]):
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_tie_bar(char):
            if not tokens:
                raise ValueError(f'The string starts with a tie bar: {string}')
            tokens[-1] += char

        elif ipa.is_diacritic(char, strict) or ipa.is_length(char):
            if tokens:
                tokens[-1] += char
            elif strict:
                raise ValueError(
                    f'The string starts with a diacritic: {string}')
            else:
                tokens.append(char)

        elif tones and ipa.is_tone(char, strict):
            if unicodedata.combining(char):
                if not tokens:
                    raise ValueError(
                        f'The string starts with an accent mark: {string}')
                tokens[-1] += char
            elif tokens and ipa.is_tone(tokens[-1][-1], strict):
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_suprasegmental(char, strict):
            pass

        elif strict:
            raise ValueError(
                f'Unrecognised char: {char} ({unicodedata.name(char)})')

        elif unknown:
            tokens.append(char)

    return tokens
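# Example run, matching the behaviour documented in ipatok's README (the
# public tokenise wrapper delegates single words to this helper):
#
# >>> tokenise_word('ˈtiːt͡ʃə')  # the stress mark is a suprasegmental
# ['t', 'iː', 't͡ʃ', 'ə']       # and is silently dropped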