def test_length(self):
    # A long vowel has to resolve to the long phoneme and must not collapse
    # into its short counterpart.
    tokens = self.reader.parse_transcript("baːwɑ")
    expected = [
        Token("p(p, b)"),
        Token("aː(aː, ɑː)"),  # and NOT the same as a(a, ɑ, ə, æ)
        Token("w"),
        Token("a(a, ɑ, ə, æ)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_noncombining_forms_are_identical_to_combining(self):
    # The decomposed spelling ("a" followed by a combining acute accent) and
    # the precomposed single code point must produce equal Tokens.
    base_letter = unicodedata.lookup("LATIN SMALL LETTER A")
    acute = unicodedata.lookup("COMBINING ACUTE ACCENT")
    decomposed = base_letter + acute
    precomposed = unicodedata.lookup("LATIN SMALL LETTER A WITH ACUTE")
    assert Token(precomposed) == Token(decomposed)
def test_allophone(self):
    # "u" and "ʊ" are listed as allophones of one phoneme, so both must be
    # parsed to the same inventory token.
    tokens = self.reader.parse_transcript("uwʊl")
    expected = [
        Token("u(u, ʊ)"),
        Token("w"),
        Token("u(u, ʊ)"),
        Token("l"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
    # the two allophones must also compare equal to each other
    assert tokens[0] == tokens[2]
def test_match_inventory_allophone(self):
    # The orthography names the phoneme by one of its allophones (/í/); with
    # an inventory supplied, the phoneme list must be the inventory token.
    inventory = [Token("i(i, í, ì, ị, ỉ, ĩ)")]
    ortheme = Ortheme("<x>=/í/", inventory=inventory)
    assert ortheme.graphemes == [Token('x')]
    assert ortheme.phonemes == inventory
def test_space_ends_word(self):
    # Regression: "cit ʔa" was being parsed as
    #   440. iˑ     vowel
    #   441. c      consonant
    #   442. i      missing +
    #   443. t ʔ    missing *
    #   444. a      vowel
    # The space has to come out as a token of its own.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("c, t(t, tⁿ)", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("iˑ, a", 'vowel')
    # add known missings
    reader.known_missings = [
        MissingToken("i", known_missing=True),
        MissingToken("ʔ", known_missing=True),
    ]
    tokens = reader.parse_transcript(reader.standardise("iˑcit ʔa"))
    expected = [
        Token("iˑ"),
        Token("c"),
        MissingToken("i", known_missing=True),
        Token("t(t, tⁿ)"),
        Token(" "),
        MissingToken("ʔ", known_missing=True),
        Token("a"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_15(self):
    # Two hyphen-joined graphemes, each with its own variant list, map onto
    # the single phoneme /i/.
    ortheme = Ortheme("<i(i, í, ì, ị, ỉ, ĩ)>-<y(y, ý, ỳ, ỷ, ỹ, ỵ)>=/i/")
    expected_graphemes = [
        Token("i(i, í, ì, ị, ỉ, ĩ)"),
        Token("y(y, ý, ỳ, ỷ, ỹ, ỵ)"),
    ]
    assert ortheme.graphemes == expected_graphemes
    assert ortheme.phonemes == [Token('i')]
def test_missing_token(self):
    # Characters the test expects to be unknown come back as MissingToken,
    # and the parse yields one token per input character.
    transcript = "ɔlaɣ"
    tokens = self.reader.parse_transcript(transcript)
    assert len(tokens) == len(transcript)
    expected = [
        MissingToken("ɔ"),
        Token("l"),
        Token("a(a, ɑ, ə, æ)"),
        MissingToken("ɣ"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_duplication(self):
    # Repeated adjacent characters must each produce their own token.
    transcript = "llall"
    tokens = self.reader.parse_transcript(transcript)
    assert len(tokens) == len(transcript)
    expected = [
        Token("l"),
        Token("l"),
        Token("a(a, ɑ, ə, æ)"),
        Token("l"),
        Token("l"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_slippage_and_duplication(self):
    # Missing tokens at both ends combined with a repeated "l" must still
    # give one correctly placed token per character.
    transcript = "ɔlalɣ"
    tokens = self.reader.parse_transcript(transcript)
    assert len(tokens) == len(transcript)
    expected = [
        MissingToken("ɔ"),
        Token("l"),
        Token("a(a, ɑ, ə, æ)"),
        Token("l"),
        MissingToken("ɣ"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_rhotic_hook(self):
    # lia˞u˞ -- the rhotic-hooked sequence is an allophone of "au" and must
    # be matched as that single token.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("l", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("i, au(au, a˞u˞)", 'vowels')
    tokens = reader.parse_transcript(reader.standardise('lia˞u˞'))
    expected = [
        Token("l"),
        Token("i"),
        Token("au(au, a˞u˞)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_identified_missing_characters(self):
    # Every error recorded by the reader has to be one of the anticipated
    # missing characters listed here.
    anticipated = [
        Token("oː"),
        Token("ɣ"),
        Token("ɔ"),
        Token("ç"),
        MissingToken('ɂ'),
    ]
    for error in self.reader.errors:
        assert error in anticipated, '%s is not in expected_missings' % error
def test_shilluk(self):
    # Vowels carrying stacked tone diacritics must be matched to their
    # inventory entries; the hyphen comes out as its own token.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("ŋ", 'consonant')
    vowel_inventory = "ɪ(ɪ́, ɪ̄, ɪ̀, ɪ̌, ɪ̂, ɪ̂́), a(á, ā, à, ǎ, â), ɪː(ɪ́ː, ɪ̄ː, ɪ̀ː, ɪ̌ː, ɪ̂ː, ɪ̂́ː)"
    reader.data['vowels'] = reader.parse_inventory(vowel_inventory, 'vowels')
    tokens = reader.parse_transcript(reader.standardise("ɪ̂́ŋ-à"))
    expected = [
        Token("ɪ(ɪ́, ɪ̄, ɪ̀, ɪ̌, ɪ̂, ɪ̂́)"),
        Token("ŋ"),
        Token("-"),
        Token("a(á, ā, à, ǎ, â)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_toIPA_uses_lowercase(self):
    # Upper-case orthographic input must be converted exactly like its
    # lower-case form.
    converted = self.reader.toIPA("PTKCH")
    expected = [
        Token('p(p, pʰ, pʷ)'),
        Token('t'),  # lost 't̪' as n:n
        Token('k(k, c, kʰ, cʰ, kʷ)'),
        Token('tʃ'),  # NOT <k(...)> and <h>
    ]
    assert len(expected) == len(converted)
    for position, want in enumerate(expected):
        got = converted[position]
        assert want == got, 'Mismatch %r : %r' % (want, converted[position])
def test_ellipsis(self):
    # Regression for an error with the ellipsis marker "[...]": once it is
    # registered as a known missing, it must be parsed as one token.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("l n", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("", 'vowels')
    reader.known_missings.update(reader.parse_list(["/[...]/"]))
    tokens = reader.parse_transcript(reader.standardise("l [...] n"))
    expected = [
        Token("l"),
        Token(" "),
        Token("[...]"),
        Token(" "),
        Token("n"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_one(self):
    # Regression: this transcript was being parsed as
    #   130. j(j, ɲ)            consonant
    #   131. o(o, ó, ò, ô, ŏ̀)   vowel
    #   132. ↗                  other
    #   133. óʔẁ̰                missing *
    #   134.                    punctuation
    standardised = self.f.standardise("jó↗óʔẁ̰")
    tokens = self.f.parse_transcript(standardised)
    expected = [
        Token("j(j, ɲ)"),
        Token("o(o, ó, ò, ô, ŏ̀)"),
        MissingToken("↗"),
        Token("o(o, ó, ò, ô, ŏ̀)"),
        Token("ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def _fulltest(self, s, allophones, names):
    """Run the full set of Token checks on the string `s`.

    Checks performed, in order:

    1. the .raw value matches the (NFC-normalised) input `s`
    2. the .token matches the part of `s` before the first "("
    3. the __repr__ is `s` wrapped in angle brackets
    4. the allophones are correctly extracted (list the expected
       allophones in parameter `allophones`; can be None)
    5. the identified token names are correct (list the expected names
       in parameter `names`)
    6. the variants list matches the expected: a single entry equal to
       the token when there are no allophones, otherwise the same length
       as the allophones and containing every one of them

    Returns True when every check passes. On the first failing check
    `t.debug()` is called and the AssertionError is re-raised.
    """
    # standardise first as that's what Token does.
    s = unicodedata.normalize("NFC", s)
    if allophones:
        allophones = [unicodedata.normalize("NFC", a) for a in allophones]

    # Expected values are computed once so that the comparison and the
    # failure message always report the same thing.
    expected_token = s.split("(")[0]
    expected_repr = '<%s>' % s

    t = Token(s)
    try:
        # 1. tests that the .raw value matches the original input `s`
        assert t.raw == s, "Raw value %r != expected %r" % (t.raw, s)
        # 2. tests that the .token matches the original input `s`
        assert t.token == expected_token, \
            "Token %r != expected %r" % (t.token, expected_token)
        # 3. tests that the __repr__ is correct
        assert repr(t) == expected_repr, \
            "Repr %r != expected %r" % (repr(t), expected_repr)
        # 4. tests that the allophones are correctly extracted
        assert t.allophones == allophones, \
            "Allophones %r != %r" % (t.allophones, allophones)
        # 5. tests that the identified token names are correct
        assert len(t.names) == len(names), \
            "Uneven amount of names: %r != %r" % (t.names, names)
        for i, name in enumerate(t.names):
            assert name == names[i], "Unexpected name %d, %s != %s" % (
                i, name, names[i])
        # 6. tests that variants list matches the expected
        if t.allophones is None:
            assert len(t.variants) == 1
            assert t.variants == [t.token]
        else:
            assert len(t.variants) == len(t.allophones)
            for a in t.allophones:
                assert a in t.variants
    except AssertionError:  # pragma: no cover
        # Dump the token's state before propagating; a bare `raise` keeps
        # the original traceback untouched.
        t.debug()
        raise
    return True
def test_maximal_error(self):
    # should identify the missing token as "o:" not ":"
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory(
        "l, ɭ, ʎ, r(r, ɾ, ɹ)", 'consonant')
    reader.data['vowels'] = reader.parse_inventory(
        "a(a, ɑ, ə, æ), o(o, ɒ), u(u, ʊ), uː", 'vowel')
    tokens = reader.parse_transcript(reader.standardise('oːlal'))
    expected = [
        MissingToken("oː"),
        Token("l"),
        Token("a(a, ɑ, ə, æ)"),
        Token("l"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_5pc(self):
    # Take the first coverage record at the 5% mark and check its figures.
    at_five_percent = [entry for entry in self.cov if entry.ppercent == 5]
    record = at_five_percent[0]
    assert record.ppercent == 5
    assert record.position == 42  # 843 / 20 = 42.15
    assert record.observed == 15
    assert record.opercent == (15 / record.total_inv) * 100
    assert record.transcript[-1] == Token('r(r, ɾ, ɹ)')
def test_sandawe(self):
    # Regression: ǁ’àká was being parsed as:
    #   489. ‖                  punctuation
    #   490. ’                  missing *
    #   491. a(a, á, à, ǎ, â)   vowel
    #   492. k                  consonant
    #
    # ǁ’ is in the inventory, but it was thought to be overridden by the
    # default ǁ in the boundary tokens.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("k, ǁ’", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("a(a, á, à, ǎ, â)", 'vowels')
    tokens = reader.parse_transcript(reader.standardise('ǁ’àká'))
    expected = [
        Token("ǁ’"),
        Token("a(a, á, à, ǎ, â)"),
        Token("k"),
        Token("a(a, á, à, ǎ, â)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_danish_overextension(self):
    # Regression: the tail was being parsed as MissingToken("də") instead of
    # MissingToken("d") followed by Token("ə").
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory(
        "b̥(b̥, b̥ʰ), d̥(d̥, d̥s), s, l(l, l̩)", 'consonant')
    reader.data['vowels'] = reader.parse_inventory(
        "e(e, eː), ɛ(ɛ, ɛː), a, ɑ, ə", 'vowel')
    tokens = reader.parse_transcript(reader.standardise('b̥lɛːsdə'))
    expected = [
        # NOTE(review): b̥ is in the inventory above, yet position 0 is
        # compared against a MissingToken -- confirm this is intended.
        MissingToken("b̥(b̥, b̥ʰ)"),
        Token("l(l, l̩)"),
        Token("ɛ(ɛ, ɛː)"),
        Token("s"),
        MissingToken("d"),
        Token("ə"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_sandawe_2(self):
    # Regression: ǀ’ùsù was being parsed as:
    #   67. |                   punctuation
    #   68. ’                   missing *
    #   69. u(u, ú, ù, ǔ, û)    vowel
    #   70. s                   consonant
    #   71. u(u, ú, ù, ǔ, û)    vowel
    #
    # ǀ’ is in the inventory, but it was thought to be overridden by the
    # default ǁ in the boundary tokens.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("s, ǀ’, x", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("u(u, ú, ù, ǔ, û)", 'vowels')
    tokens = reader.parse_transcript(reader.standardise('ǀ’ùsù'))
    expected = [
        Token("ǀ’"),
        Token("u(u, ú, ù, ǔ, û)"),
        Token("s"),
        Token("u(u, ú, ù, ǔ, û)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_s_COMBINING_INVERTED_BRIDGE_BELOW_allophone(self):
    # This originally failed because s̺ was not among the allophones, so s̺
    # matched nothing. The fix lives at the Token level and is checked in
    # test_Token.test_initial_char_in_allophones.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("s̺(s, s̬, s̺)", 'consonant')
    standardised = reader.standardise('s̺')
    tokens = reader.parse_transcript(standardised)
    assert len(tokens) == 1
    assert tokens[0] == Token('s̺(s, s̬, s̺)')
def test_basaa_combining_n_only_attached_to_preceeding(self):
    # Regression: pêⁿbà was being parsed as:
    #   43. p                   consonant
    #   44. e(e, é, è, ě, ê)    vowel
    #   45. hⁿ                  missing *
    #   46. b                   missing *
    #   47. a(a, á, à, ǎ, â)    vowel
    # The superscript n has to attach to the following "b" (token ⁿb).
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("p, h, ⁿb", 'consonant')
    reader.data['vowels'] = reader.parse_inventory(
        "e(e, é, è, ě, ê), a(a, á, à, ǎ, â)", 'vowels')
    tokens = reader.parse_transcript(reader.standardise('pêhⁿbà'))
    expected = [
        Token("p"),
        Token("e(e, é, è, ě, ê)"),
        Token("h"),
        Token("ⁿb"),
        Token("a(a, á, à, ǎ, â)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_initial_char_in_allophones(self):
    # see test_regression.test_s_COMBINING_INVERTED_BRIDGE_BELOW_allophone
    # and test_regression.test_galician
    #
    # The head character "b" is not one of the listed allophones, but it
    # must still appear among the variants.
    token = Token("b(b̥, β̞)")
    assert len(token.allophones) == 2
    for allophone in ("b̥", "β̞"):
        assert allophone in token.allophones
    assert len(token.variants) == 3
    for variant in ("b", "β̞", "b̥"):
        assert variant in token.variants
def test_inventory_and_orthography_token_matches(self):
    # The orthographies tend to be underspecified, e.g.
    #   p(p, pʰ, pʷ) is in the inventory but the orthography has
    #   <p> = /p/
    # ... so the full form should be matched first and, failing that, the
    # short form.
    variants = self.reader.get_variants()
    assert 'p' in variants
    # ... and in orthography
    ortheme = Ortheme('<p> = /p(p, pʰ, pʷ)/')
    assert ortheme in self.reader.orthography
    # ... and in variants
    full_form = Token('p(p, pʰ, pʷ)')
    assert full_form == self.reader.get_variants()['p']
def test_combining_in_others(self):
    # Setswana's 'bó̝kɔ̝̀ːnì' was being parsed as:
    #
    #   8.  b                              consonant
    #   9.  o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)            vowel
    #   10. k                              consonant
    #   11. ɔ̝̀ː                            missing *
    #   12. n                              consonant
    #   13. i(i, ì, í, î, ǐ, ì̞, í̞)        vowel
    #
    # i.e. in token 11 the length mark "ː" was merged onto the character
    # 'ɔ̝̀'. 'ɔ̝̀' is known here but 'ɔ̝̀ː' is NOT, so the merged form got
    # flagged as an error. "ː" is listed in the other symbols and was not
    # being recognised as such.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("b, k, n", 'consonant')
    reader.data['vowels'] = reader.parse_inventory(
        "o̝(o̝, ò̝, ó̝, ô̝, ǒ̝), i(i, ì, í, î, ǐ, ì̞, í̞)", 'vowel')
    reader.known_missings.update(reader.parse_list(['/ɔ̝̀/']))
    # Other: ː
    reader.other_symbols.update(reader.parse_inventory('ː', 'other'))
    parsed = reader.parse_transcript(reader.standardise('bó̝kɔ̝̀ːnì'))
    expected = [
        Token("b"),
        Token("o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)"),
        Token("k"),
        MissingToken("ɔ̝̀"),
        Token("ː"),
        Token("n"),
        Token("i(i, ì, í, î, ǐ, ì̞, í̞)"),
    ]
    for position, want in enumerate(expected):
        # show the whole parse on failure
        assert parsed[position] == want, parsed
def test_basaa_ignored_superscript_n(self):
    # Regression: gáː ⁿbɛ̀βí was being parsed as
    #
    #   9.  h                           consonant
    #   10. a(a, á, à, ǎ, â)            vowel
    #   11. ŋ(ŋ, ŋ́, ŋ̀)                 consonant
    #   12. g                           missing *
    #   13. aː(aː, áː, àː, ǎː, âː)      vowel
    #   14. ⁿ                           missing *
    #   15. b                           missing *
    #   16. ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)          vowel
    #   17. β                           consonant
    #   18. i(i, í, ì, ǐ, î)            vowel
    #
    # i.e. 14 should be combined with 15 = ⁿb
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("gʷ, ⁿb, ⁿg, β", 'consonant')
    vowel_inventory = """ a(a, á, à, ǎ, â), aː(aː, áː, àː, ǎː, âː), e(e, é, è, ě, ê), ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂), i(i, í, ì, ǐ, î), """
    reader.data['vowels'] = reader.parse_inventory(vowel_inventory, 'vowels')
    tokens = reader.parse_transcript(reader.standardise('gáː ⁿbɛ̀βí'))
    expected = [
        MissingToken("g"),  # known missing
        Token("aː(aː, áː, àː, ǎː, âː)"),
        Token(" "),  # was incorrect -- should be SPACE.
        Token("ⁿb"),  # was incorrect
        Token("ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)"),
        Token("β"),
        Token("i(i, í, ì, ǐ, î)"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_overmatching(self):
    # Regression: this was being parsed as
    #   [<h>, <ao>, <MissingToken: ̯>, <a>]
    # but should be
    #   [<h>, <a>, <o ̯>, <a>]
    # This is thought to matter only when a word of the form
    #   123
    # is encountered and the tokens "1", "12" and "23" all exist.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("h", 'consonant')
    reader.data['vowels'] = reader.parse_inventory(
        "i, e(e, e̯), ɜ, a, ɔ, o(o, o̯), u, ao", 'vowel')
    tokens = reader.parse_transcript(reader.standardise('hao̯a'))
    expected = [
        Token("h"),
        Token("a"),
        Token("o(o, o̯)"),
        Token("a"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want
def test_get_missing(self):
    # Flag checks use identity (`is True` / `is False`) rather than
    # `== True` / `== False`, so a merely truthy or falsy value such as
    # 1 or 0 does not satisfy them.

    # if missing char is in the known_missings, then it returns a
    # MissingToken with known_missing set to True
    x = self.reader.get_missing('x')
    assert x == MissingToken('x')
    assert x.known_missing is True

    # if missing char is in the default_tokens, then it returns a Token
    # with phoneme_type="default"
    dot = self.reader.get_missing(".")
    assert dot == Token('.')
    assert dot.is_missing is False
    assert dot.phoneme_type == 'default'

    # otherwise it's just Missing
    nine = self.reader.get_missing('9')
    assert nine == MissingToken('9')
    assert nine.known_missing is False
def test_upper_xumi(self):
    # Regression for multi-character other symbols being identified as
    # several single ones, e.g. "||" coming out as two "|" tokens.
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("l H", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("i", 'vowels')
    reader.known_missings.update(reader.parse_list(["/|/", "/||/"]))
    tokens = reader.parse_transcript(reader.standardise("li || H"))
    expected = [
        Token("l"),
        Token("i"),
        Token(" "),
        Token("||"),
        Token(" "),
        Token("H"),
    ]
    for position, want in enumerate(expected):
        assert tokens[position] == want