def test_space_ends_word(self):
    # "cit ʔa" being parsed as
    # 440. iˑ vowel
    # 441. c consonant
    # 442. i missing +
    # 443. t ʔ missing *
    # 444. a vowel
    f = FileReader()
    f.data['consonants'] = f.parse_inventory("c, t(t, tⁿ)", 'consonant')
    f.data['vowels'] = f.parse_inventory("iˑ, a", 'vowel')
    # add known missings
    f.known_missings = [
        MissingToken("i", known_missing=True),
        MissingToken("ʔ", known_missing=True),
    ]
    transcript = f.standardise("iˑcit ʔa")
    parsed = f.parse_transcript(transcript)
    assert parsed[0] == Token("iˑ")
    assert parsed[1] == Token("c")
    assert parsed[2] == MissingToken("i", known_missing=True)
    assert parsed[3] == Token("t(t, tⁿ)")
    assert parsed[4] == Token(" ")
    assert parsed[5] == MissingToken("ʔ", known_missing=True)
    assert parsed[6] == Token("a")

def test_missing_token(self):
    string = "ɔlaɣ"
    parsed = self.reader.parse_transcript(string)
    assert len(parsed) == len(string)
    assert parsed[0] == MissingToken("ɔ")
    assert parsed[1] == Token("l")
    assert parsed[2] == Token("a(a, ɑ, ə, æ)")
    assert parsed[3] == MissingToken("ɣ")

def test_slippage_and_duplication(self):
    string = "ɔlalɣ"
    parsed = self.reader.parse_transcript(string)
    assert len(parsed) == len(string)
    assert parsed[0] == MissingToken("ɔ")
    assert parsed[1] == Token("l")
    assert parsed[2] == Token("a(a, ɑ, ə, æ)")
    assert parsed[3] == Token("l")
    assert parsed[4] == MissingToken("ɣ")

def test_basaa_ignored_superscript_n(self):
    # gáː ⁿbɛ̀βí being parsed as
    #
    # 9. h consonant
    # 10. a(a, á, à, ǎ, â) vowel
    # 11. ŋ(ŋ, ŋ́, ŋ̀) consonant
    # 12. g missing *
    # 13. aː(aː, áː, àː, ǎː, âː) vowel
    # 14. ⁿ missing *
    # 15. b missing *
    # 16. ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂) vowel
    # 17. β consonant
    # 18. i(i, í, ì, ǐ, î) vowel
    #
    # i.e. 14 should be combined with 15 = ⁿb
    f = FileReader()
    f.data['consonants'] = f.parse_inventory("gʷ, ⁿb, ⁿg, β", 'consonant')
    f.data['vowels'] = f.parse_inventory(
        """
        a(a, á, à, ǎ, â),
        aː(aː, áː, àː, ǎː, âː),
        e(e, é, è, ě, ê),
        ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂),
        i(i, í, ì, ǐ, î),
        """, 'vowels')
    transcript = f.standardise('gáː ⁿbɛ̀βí')
    parsed = f.parse_transcript(transcript)
    assert parsed[0] == MissingToken("g")  # known missing
    assert parsed[1] == Token("aː(aː, áː, àː, ǎː, âː)")
    assert parsed[2] == Token(" ")   # was incorrect -- should be SPACE.
    assert parsed[3] == Token("ⁿb")  # was incorrect
    assert parsed[4] == Token("ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)")
    assert parsed[5] == Token("β")
    assert parsed[6] == Token("i(i, í, ì, ǐ, î)")

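# A minimal sketch of the longest-match lookup the test above relies on: try
# multi-character inventory entries (like "ⁿb") before falling back to single
# characters. This is an illustration under assumed names (`match_inventory`,
# `inventory`), not the actual FileReader.parse_transcript implementation.
def match_inventory(transcript, inventory):
    """Prefer the longest inventory entry at each position."""
    tokens, i = [], 0
    while i < len(transcript):
        for size in range(len(transcript) - i, 0, -1):
            chunk = transcript[i:i + size]
            if chunk in inventory:
                tokens.append(chunk)
                i += size
                break
        else:
            tokens.append(transcript[i])  # no entry matched this character
            i += 1
    return tokens

# e.g. match_inventory("ⁿbɛ", {"ⁿb", "ɛ"}) == ["ⁿb", "ɛ"], never ["ⁿ", "b", "ɛ"]
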
def test_combining_in_others(self):
    # Setswana's 'bó̝kɔ̝̀ːnì' was being parsed as:
    #
    # 8. b consonant
    # 9. o̝(o̝, ò̝, ó̝, ô̝, ǒ̝) vowel
    # 10. k consonant
    # 11. ɔ̝̀ː missing *
    # 12. n consonant
    # 13. i(i, ì, í, î, ǐ, ì̞, í̞) vowel
    #
    # i.e. in token 11 the length mark "ː" (triangular colon) is merged into
    # the character 'ɔ̝̀'. 'ɔ̝̀' is IN the inventory, but 'ɔ̝̀ː' is NOT, so this
    # gets flagged as an error. "ː" is in other_symbols and is currently not
    # being recognized as such.
    f = FileReader()
    f.data['consonants'] = f.parse_inventory("b, k, n", 'consonant')
    f.data['vowels'] = f.parse_inventory(
        "o̝(o̝, ò̝, ó̝, ô̝, ǒ̝), i(i, ì, í, î, ǐ, ì̞, í̞)", 'vowel')
    f.known_missings.update(f.parse_list(['/ɔ̝̀/']))
    f.other_symbols.update(f.parse_inventory('ː', 'other'))  # Other: ː
    transcript = 'bó̝kɔ̝̀ːnì'
    transcript = f.standardise(transcript)
    parsed = f.parse_transcript(transcript)
    assert parsed[0] == Token("b"), parsed
    assert parsed[1] == Token("o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)"), parsed
    assert parsed[2] == Token("k"), parsed
    assert parsed[3] == MissingToken("ɔ̝̀"), parsed
    assert parsed[4] == Token("ː"), parsed
    assert parsed[5] == Token("n"), parsed
    assert parsed[6] == Token("i(i, ì, í, î, ǐ, ì̞, í̞)"), parsed

def test_danish_overextension(self):
    # was being parsed as ... MissingToken("də"),
    # not MissingToken("d"), Token("ə")
    transcript = 'b̥lɛːsdə'
    f = FileReader()
    f.data['consonants'] = f.parse_inventory(
        "b̥(b̥, b̥ʰ), d̥(d̥, d̥s), s, l(l, l̩)", 'consonant')
    f.data['vowels'] = f.parse_inventory("e(e, eː), ɛ(ɛ, ɛː), a, ɑ, ə", 'vowel')
    transcript = f.standardise(transcript)
    parsed = f.parse_transcript(transcript)
    assert parsed[0] == MissingToken("b̥(b̥, b̥ʰ)")
    assert parsed[1] == Token("l(l, l̩)")
    assert parsed[2] == Token("ɛ(ɛ, ɛː)")
    assert parsed[3] == Token("s")
    assert parsed[4] == MissingToken("d")
    assert parsed[5] == Token("ə")

def test_get_missing(self):
    # if missing char is in the known_missings, then it returns a MissingToken
    # with known_missing set to True
    x = self.reader.get_missing('x')
    assert x == MissingToken('x')
    assert x.known_missing == True

    # if missing char is in the default_tokens, then it returns a Token
    # with phoneme_type="default"
    dot = self.reader.get_missing(".")
    assert dot == Token('.')
    assert dot.is_missing == False
    assert dot.phoneme_type == 'default'

    # otherwise it's just Missing
    nine = self.reader.get_missing('9')
    assert nine == MissingToken('9')
    assert nine.known_missing == False

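# Rough sketch of the fallback order the assertions above describe: known
# missings first, then default tokens, then a plain MissingToken. The control
# flow here is an assumption for illustration, not FileReader.get_missing itself.
def get_missing_sketch(char, known_missings, default_tokens):
    if char in known_missings:
        return MissingToken(char, known_missing=True)
    if char in default_tokens:
        return Token(char)  # the real Token would carry phoneme_type='default'
    return MissingToken(char)
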
def test_identified_missing_characters(self):
    expected_missings = [
        Token("oː"), Token("ɣ"), Token("ɔ"), Token("ç"), MissingToken('ɂ')
    ]
    for m in self.reader.errors:
        assert m in expected_missings, '%s is not in expected_missings' % m

def test_one(self):
    # parsed as
    # 130. j(j, ɲ) consonant
    # 131. o(o, ó, ò, ô, ŏ̀) vowel
    # 132. ↗ other
    # 133. óʔẁ̰ missing *
    # 134. punctuation
    transcript = self.f.standardise("jó↗óʔẁ̰")
    parsed = self.f.parse_transcript(transcript)
    assert parsed[0] == Token("j(j, ɲ)")
    assert parsed[1] == Token("o(o, ó, ò, ô, ŏ̀)")
    assert parsed[2] == MissingToken("↗")
    assert parsed[3] == Token("o(o, ó, ò, ô, ŏ̀)")
    assert parsed[4] == Token("ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰)")

def test_maximal_error(self):
    # should identify the missing token as "oː", not "ː"
    transcript = 'oːlal'
    f = FileReader()
    f.data['consonants'] = f.parse_inventory("l, ɭ, ʎ, r(r, ɾ, ɹ)", 'consonant')
    f.data['vowels'] = f.parse_inventory(
        "a(a, ɑ, ə, æ), o(o, ɒ), u(u, ʊ), uː", 'vowel')
    transcript = f.standardise(transcript)
    parsed = f.parse_transcript(transcript)
    assert parsed[0] == MissingToken("oː")
    assert parsed[1] == Token("l")
    assert parsed[2] == Token("a(a, ɑ, ə, æ)")
    assert parsed[3] == Token("l")

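# A minimal sketch of why "oː" should surface as one missing unit: group each
# base character with any trailing combining marks or length modifiers before
# the inventory lookup. Purely illustrative; `cluster` and the `modifiers`
# tuple are assumptions, not part of the FileReader API.
import unicodedata

def cluster(text, modifiers=('ː', 'ˑ')):
    """Split text into units of base character + attached marks."""
    units = []
    for char in text:
        if units and (unicodedata.combining(char) or char in modifiers):
            units[-1] += char  # attach to the preceding base character
        else:
            units.append(char)
    return units

# e.g. cluster('oːlal') == ['oː', 'l', 'a', 'l'], so the unknown unit is 'oː'
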
def test_known_missing(self):
    assert MissingToken("X", known_missing=True).known_missing
    assert not MissingToken("X", known_missing=False).known_missing

def test_repr_known_missing(self):
    token = MissingToken("X", known_missing=True)
    assert repr(token) == "<KnownMissingToken: X>"

def test_repr(self):
    assert repr(MissingToken("X")) == "<MissingToken: X>"

def test_phoneme_type(self):
    assert MissingToken("X").phoneme_type == 'missing'

def test_is_missing(self):
    assert MissingToken("X").is_missing

def test_missing_token(self):
    MissingToken()  # should construct without arguments

def test_other_symbols_not_in_missing(self):
    assert MissingToken("ɂ") not in self.reader.errors

def test_not_greedy(self):
    # only the first /x/-delimited segment should be captured, not
    # everything up to the later "/"
    errors = self.reader.parse_list(["/x/ something/or other"])
    assert errors == [MissingToken("x", known_missing=True)]
    assert errors[0].is_missing == True
    assert errors[0].known_missing == True

def test_simple_error(self):
    errors = self.reader.parse_list(["/x/ something"])
    assert errors == [MissingToken("x", known_missing=True)]
    assert errors[0].is_missing == True
    assert errors[0].known_missing == True
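
# One non-greedy way to pull the /x/-delimited character out of a notes line,
# matching what test_not_greedy and test_simple_error expect (only "x" is
# captured, not "x/ something"). Illustrative only; parse_list may be
# implemented differently.
import re

SLASH_SPAN = re.compile(r'/(.+?)/')

def first_slash_span(line):
    """Return the contents of the first /.../ span in `line`, or None."""
    match = SLASH_SPAN.search(line)
    return match.group(1) if match else None

# first_slash_span("/x/ something/or other") == "x"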