Example #1
0
    def test_space_ends_word(self):
        # Regression test: a space must close the current word.
        # The fragment "cit ʔa" used to be reported as:
        # 440.    iˑ                      vowel
        # 441.    c                       consonant
        # 442.    i                       missing       +
        # 443.    t ʔ                     missing       *
        # 444.    a                       vowel
        # i.e. row 443 merged "t" and "ʔ" across the space.

        reader = FileReader()
        reader.data['consonants'] = reader.parse_inventory(
            "c, t(t, tⁿ)", 'consonant')
        reader.data['vowels'] = reader.parse_inventory("iˑ, a", 'vowel')
        # "i" and "ʔ" are not in the inventory above, so declare them as
        # known missings before parsing.
        reader.known_missings = [
            MissingToken("i", known_missing=True),
            MissingToken("ʔ", known_missing=True),
        ]
        parsed = reader.parse_transcript(reader.standardise("iˑcit ʔa"))

        # Expected tokens, in order; the space is a token of its own.
        expected = [
            Token("iˑ"),
            Token("c"),
            MissingToken("i", known_missing=True),
            Token("t(t, tⁿ)"),
            Token(" "),
            MissingToken("ʔ", known_missing=True),
            Token("a"),
        ]
        for position, wanted in enumerate(expected):
            assert parsed[position] == wanted
 def test_missing_token(self):
     # The parse must yield one token per input character, with "ɔ" and
     # "ɣ" reported as MissingToken entries.
     transcript = "ɔlaɣ"
     tokens = self.reader.parse_transcript(transcript)
     assert len(tokens) == len(transcript)
     expected = (
         MissingToken("ɔ"),
         Token("l"),
         Token("a(a, ɑ, ə, æ)"),
         MissingToken("ɣ"),
     )
     for position, wanted in enumerate(expected):
         assert tokens[position] == wanted
 def test_slippage_and_duplication(self):
     # "l" occurs twice in the transcript; both occurrences must be emitted
     # at their own positions, and the token count must equal the number of
     # input characters.
     transcript = "ɔlalɣ"
     tokens = self.reader.parse_transcript(transcript)
     assert len(tokens) == len(transcript)
     expected = (
         MissingToken("ɔ"),
         Token("l"),
         Token("a(a, ɑ, ə, æ)"),
         Token("l"),
         MissingToken("ɣ"),
     )
     for position, wanted in enumerate(expected):
         assert tokens[position] == wanted
Example #4
0
    def test_basaa_ignored_superscript_n(self):
        # Regression test: "gáː ⁿbɛ̀βí" used to be reported as
        #
        # 9.     h                       consonant
        # 10.    a(a, á, à, ǎ, â)    vowel
        # 11.    ŋ(ŋ, ŋ́, ŋ̀)            consonant
        # 12.    g                       missing       *
        # 13.    aː(aː, áː, àː, ǎː, âː)    vowel
        # 14.     ⁿ                      missing       *
        # 15.    b                       missing       *
        # 16.    ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)    vowel
        # 17.    β                       consonant
        # 18.    i(i, í, ì, ǐ, î)    vowel
        #
        # Rows 14 and 15 belong together: they should form the single
        # inventory phoneme "ⁿb", preceded by a separate space token.
        vowel_inventory = """
            a(a, á, à, ǎ, â), aː(aː, áː, àː, ǎː, âː),
            e(e, é, è, ě, ê), ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂),
            i(i, í, ì, ǐ, î),
            """
        reader = FileReader()
        reader.data['consonants'] = reader.parse_inventory(
            "gʷ, ⁿb, ⁿg, β", 'consonant')
        # NOTE(review): the type argument here is 'vowels', while the other
        # tests in this file pass 'vowel' -- confirm this is intended.
        reader.data['vowels'] = reader.parse_inventory(
            vowel_inventory, 'vowels')
        parsed = reader.parse_transcript(reader.standardise('gáː ⁿbɛ̀βí'))

        expected = [
            # plain "g" is not in the inventory (only "gʷ" is)
            MissingToken("g"),
            Token("aː(aː, áː, àː, ǎː, âː)"),
            # previously wrong: the space must be its own token
            Token(" "),
            # previously wrong: "ⁿ" + "b" must be combined
            Token("ⁿb"),
            Token("ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)"),
            Token("β"),
            Token("i(i, í, ì, ǐ, î)"),
        ]
        for position, wanted in enumerate(expected):
            assert parsed[position] == wanted
Example #5
0
    def test_combining_in_others(self):
        # Regression test: Setswana 'bó̝kɔ̝̀ːnì' used to be reported as
        #
        # 8.    b                                      consonant
        # 9.    o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)                 vowel
        # 10.    k                                      consonant
        # 11.    ɔ̝̀ː                                   missing       *
        # 12.    n                                      consonant
        # 13.    i(i, ì, í, î, ǐ, ì̞, í̞)               vowel
        #
        # Row 11 is the problem: the length mark "ː" was attached to 'ɔ̝̀'.
        # 'ɔ̝̀' is declared below (as a known missing) and "ː" is declared
        # as an "other" symbol, but the merged 'ɔ̝̀ː' matches neither, so it
        # was flagged as an error. The two must be emitted separately.

        reader = FileReader()
        reader.data['consonants'] = reader.parse_inventory(
            "b, k, n", 'consonant')
        reader.data['vowels'] = reader.parse_inventory(
            "o̝(o̝, ò̝, ó̝, ô̝, ǒ̝), i(i, ì, í, î, ǐ, ì̞, í̞)", 'vowel')
        reader.known_missings.update(reader.parse_list(['/ɔ̝̀/']))
        # Other: ː
        reader.other_symbols.update(reader.parse_inventory('ː', 'other'))

        parsed = reader.parse_transcript(reader.standardise('bó̝kɔ̝̀ːnì'))

        expected = [
            Token("b"),
            Token("o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)"),
            Token("k"),
            MissingToken("ɔ̝̀"),
            Token("ː"),
            Token("n"),
            Token("i(i, ì, í, î, ǐ, ì̞, í̞)"),
        ]
        # The full parse is attached as the failure message for debugging.
        for position, wanted in enumerate(expected):
            assert parsed[position] == wanted, parsed
Example #6
0
 def test_danish_overextension(self):
     # Regression test: the tail of the transcript used to come out as a
     # single MissingToken("də") instead of MissingToken("d") followed by
     # Token("ə").
     reader = FileReader()
     reader.data['consonants'] = reader.parse_inventory(
         "b̥(b̥, b̥ʰ), d̥(d̥, d̥s), s, l(l, l̩)", 'consonant')
     reader.data['vowels'] = reader.parse_inventory(
         "e(e, eː), ɛ(ɛ, ɛː), a, ɑ, ə", 'vowel')
     parsed = reader.parse_transcript(reader.standardise('b̥lɛːsdə'))

     expected = [
         # NOTE(review): "b̥(b̥, b̥ʰ)" is declared in the consonant inventory
         # yet is compared against a MissingToken -- presumably token
         # equality ignores the class; verify.
         MissingToken("b̥(b̥, b̥ʰ)"),
         Token("l(l, l̩)"),
         Token("ɛ(ɛ, ɛː)"),
         Token("s"),
         MissingToken("d"),
         Token("ə"),
     ]
     for position, wanted in enumerate(expected):
         assert parsed[position] == wanted
    def test_get_missing(self):
        # Exercises the three outcomes of get_missing() that this test
        # distinguishes. Boolean attributes are asserted by truthiness
        # rather than compared to True/False with "==" (PEP 8), matching
        # the other MissingToken tests in this file.

        # A character listed in known_missings comes back as a MissingToken
        # with known_missing set.
        x = self.reader.get_missing('x')
        assert x == MissingToken('x')
        assert x.known_missing

        # A character in the default tokens comes back as a regular Token
        # with phoneme_type="default"; it is not flagged as missing.
        dot = self.reader.get_missing(".")
        assert dot == Token('.')
        assert not dot.is_missing
        assert dot.phoneme_type == 'default'

        # Anything else is a plain MissingToken that is not known-missing.
        nine = self.reader.get_missing('9')
        assert nine == MissingToken('9')
        assert not nine.known_missing
 def test_identified_missing_characters(self):
     # Every error recorded by the reader must be one of the tokens listed
     # here. Note the loop passes vacuously when no errors were recorded.
     allowed = (
         Token("oː"),
         Token("ɣ"),
         Token("ɔ"),
         Token("ç"),
         MissingToken('ɂ'),
     )
     for error in self.reader.errors:
         assert error in allowed, '%s is not in expected_missings' % error
Example #9
0
 def test_one(self):
     # Regression test: the transcript used to be reported as
     # 130.  j(j, ɲ)                 consonant
     # 131.  o(o, ó, ò, ô, ŏ̀)       vowel
     # 132.  ↗                       other
     # 133.  óʔẁ̰                    missing    *
     # 134.                          punctuation
     # Row 133 must be split into the vowel and the "ʔw̰" phoneme.
     tokens = self.f.parse_transcript(self.f.standardise("jó↗óʔẁ̰"))
     expected = (
         Token("j(j, ɲ)"),
         Token("o(o, ó, ò, ô, ŏ̀)"),
         MissingToken("↗"),
         Token("o(o, ó, ò, ô, ŏ̀)"),
         Token("ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰)"),
     )
     for position, wanted in enumerate(expected):
         assert tokens[position] == wanted
Example #10
0
    def test_maximal_error(self):
        # The unmatched span must be reported maximally: the missing token
        # is "oː" as a whole, not just the length mark "ː".
        reader = FileReader()
        reader.data['consonants'] = reader.parse_inventory(
            "l, ɭ, ʎ, r(r, ɾ, ɹ)", 'consonant')
        reader.data['vowels'] = reader.parse_inventory(
            "a(a, ɑ, ə, æ), o(o, ɒ), u(u, ʊ), uː", 'vowel')

        tokens = reader.parse_transcript(reader.standardise('oːlal'))

        first, second, third, fourth = (tokens[i] for i in range(4))
        assert first == MissingToken("oː")
        assert second == Token("l")
        assert third == Token("a(a, ɑ, ə, æ)")
        assert fourth == Token("l")
Example #11
0
 def test_known_missing(self):
     # The known_missing constructor flag is reflected (by truthiness) on
     # the instance attribute of the same name.
     flagged = MissingToken("X", known_missing=True)
     assert flagged.known_missing
     unflagged = MissingToken("X", known_missing=False)
     assert not unflagged.known_missing
Example #12
0
 def test_repr_known_missing(self):
     # A known-missing token has its own repr label.
     token = MissingToken("X", known_missing=True)
     assert repr(token) == "<KnownMissingToken: X>"
Example #13
0
 def test_repr(self):
     # Default repr of a MissingToken built without the known_missing flag.
     token = MissingToken("X")
     assert repr(token) == "<MissingToken: X>"
Example #14
0
 def test_phoneme_type(self):
     # A MissingToken reports 'missing' as its phoneme type.
     token = MissingToken("X")
     assert token.phoneme_type == 'missing'
Example #15
0
 def test_is_missing(self):
     # is_missing is truthy on a MissingToken.
     token = MissingToken("X")
     assert token.is_missing
Example #16
0
 def test_missing_token(self):
     # Calls the constructor with no arguments; there are no assertions,
     # so the test only fails if the call itself raises.
     # NOTE(review): confirm whether an argument-less MissingToken is meant
     # to be valid, or whether this was meant to check for an exception.
     _ = MissingToken()
 def test_other_symbols_not_in_missing(self):
     # "ɂ" must not show up among the reader's recorded errors.
     unexpected = MissingToken("ɂ")
     assert unexpected not in self.reader.errors
 def test_not_greedy(self):
     # The input line contains a third "/" after the "/x/" entry; only "x"
     # may be extracted, i.e. the match must stop at the second slash.
     # Boolean attributes are asserted by truthiness instead of "== True"
     # (PEP 8), matching the other MissingToken tests in this file.
     errors = self.reader.parse_list(["/x/ something/or other"])
     assert errors == [MissingToken("x", known_missing=True)]
     assert errors[0].is_missing
     assert errors[0].known_missing
 def test_simple_error(self):
     # A "/x/ ..." line yields exactly one token, "x", flagged both as
     # missing and as known-missing.
     # Boolean attributes are asserted by truthiness instead of "== True"
     # (PEP 8), matching the other MissingToken tests in this file.
     errors = self.reader.parse_list(["/x/ something"])
     assert errors == [MissingToken("x", known_missing=True)]
     assert errors[0].is_missing
     assert errors[0].known_missing