def encode(self, text):
    """Transliterate Māori text into hangul syllable blocks.

    Each Māori syllable is mapped through ``self.encoder_dict`` to a
    (consonant, vowel) jamo pair and composed into a hangul block with
    ``jamo.j2h``.  Non-Māori words, and syllables containing characters
    outside the Māori alphabet, are passed through unchanged.

    Args:
        text: the input text to encode.

    Returns:
        The detokenized text with Māori words replaced by their
        hangul-encoded form.

    Raises:
        KeyError: a phoneme is missing from ``self.encoder_dict``.
        jamo.InvalidJamoError: a (consonant, vowel) pair cannot be
            composed into a hangul block.
    """
    from reo_toolkit import is_maori  # hoisted: was re-imported on every word

    text = self.preprocess(text, vowel_type=self.vowel_type)
    words = []
    for word in TreebankWordTokenizer().tokenize(text):
        if not is_maori(word):
            words.append(word)
            continue
        encoded_text = []
        for syllable in self.tokenize(word):
            if not all(ch in alphabet for ch in syllable):
                # Punctuation / foreign characters pass through unchanged.
                encoded_text.append(syllable)
                continue
            if syllable in vowels:
                # Vowel-only syllables get the placeholder consonant 'x'
                # so they still form a (consonant, vowel) pair.
                syllable = 'x' + syllable
            try:
                consonant, vowel = ''.join(
                    [self.encoder_dict[ch] for ch in syllable])
            except KeyError:
                logging.error(
                    "KeyError: phoneme {} not in encoder_dict".format(
                        syllable))
                # BUG FIX: re-raise the original exception; the old
                # ``raise KeyError`` replaced it with a bare, message-less
                # KeyError and lost the traceback context.
                raise
            try:
                encoded = jamo.j2h(consonant, vowel)
            except jamo.InvalidJamoError:
                logging.error(
                    'InvalidJamoError - Consonant={} Vowel={} Syllable={}'.
                    format(consonant, vowel, syllable))
                # BUG FIX: the original only logged and fell through to the
                # append below, using an undefined (NameError) or stale
                # ``encoded`` value; propagate the error instead.
                raise
            encoded_text.append(encoded)
        words.append(''.join(encoded_text))
    return TreebankWordDetokenizer().detokenize(words)
def tidy_text(text):
    """Filter *text* down to sentences that look like Māori prose.

    Each newline-separated paragraph is sentence-tokenized; a sentence is
    dropped when it contains more numeric tokens than word tokens, or when
    ``is_maori`` rejects it.  Surviving sentences are re-joined into
    paragraphs separated by blank lines.

    Args:
        text: raw text, paragraphs separated by newlines.

    Returns:
        The filtered text, with runs of 3+ newlines collapsed to a single
        blank line.
    """
    # Compile once, outside the token loop.
    # BUG FIX: the original class was ``[A-zāēīōū]`` — the range ``A-z``
    # also matches the ASCII punctuation between 'Z' and 'a' ("[\]^_`"),
    # so punctuation tokens were counted as words.  ``re.IGNORECASE``
    # already covers the upper-case letters.
    word_re = re.compile('[a-zāēīōū]', re.IGNORECASE)
    num_re = re.compile('[0-9]')
    paras = []
    for para in text.split("\n"):
        sents = []
        para = para.strip()
        for sent in sent_tokenize(para):
            tokens = word_tokenize(sent)
            words = sum(1 for token in tokens if word_re.search(token))
            nums = sum(1 for token in tokens if num_re.search(token))
            if nums > words:
                logging.debug(
                    'Rejected this sentence due to too many numbers: {}'.
                    format(sent))
                continue
            # Keep only Māori sentences that contain at least one letter.
            if is_maori(sent) and word_re.search(sent):
                sents.append(sent)
        paras.append(' '.join(sents))
    text = '\n\n'.join(paras)
    return re.sub("\n{3,}", "\n\n", text)
def test_apostrophe():
    """A quoted proverb containing apostrophes should still be accepted."""
    proverb = "Ko 'Mā whero, mā pango, ka oti te mahi' ētahi o ngā whakatauki rongonui"
    assert is_maori(proverb)
def test_camel_case():
    """Camel-cased Māori passes; camel-cased English fails even when lenient."""
    assert is_maori("KeiTePai")
    camel_english = is_maori("MeToo", strict=False)
    assert not camel_english
def test_many_vowels():
    """A word made almost entirely of vowels is valid Māori."""
    result = is_maori("Papaoiea")
    assert result
def test_macron_combining_character():
    r"""The unicode code point \u0304 is a combining character that adds
    a macron to the preceding letter.

    BUG FIX: the docstring is now a raw string so ``\u0304`` is displayed
    literally; previously the escape was interpreted and an invisible
    combining macron was embedded in the docstring text.
    """
    # 'a' followed by the combining macron, round-tripped through UTF-8.
    assert is_maori('a\u0304'.encode('utf-8').decode())
def test_te_tiriti_o_waitangi():
    """The full text of te Tiriti o Waitangi passes the strict check."""
    # BUG FIX: read as UTF-8 explicitly — the transcript contains
    # macronised vowels, and the platform default encoding (e.g. cp1252
    # on Windows) could mis-decode or fail on them.
    with open('data/te-tiriti-o-waitangi.txt', 'r', encoding='utf-8') as f:
        transcript = f.read()
    assert is_maori(transcript, strict=True)
def test_okina():
    """The ʻokina (glottal-stop letter) is not part of the Māori alphabet."""
    result = is_maori(" ʻokina")
    assert not result
def test_māori_word():
    """A historical Māori sentence written without macrons is accepted."""
    sentence = 'Ko matou ko nga Tino Rangatira o nga iwi o Nu Tireni'
    assert is_maori(sentence)
def test_cleaning():
    """Guard against a false positive: stripping the non-Māori characters
    from 'six' would leave the valid Māori word 'i'."""
    result = is_maori("six")
    assert not result
def test_ambiguous_word():
    """'a' is ambiguous: rejected when lenient, accepted when strict."""
    lenient = is_maori('a', strict=False)
    strict = is_maori('a', strict=True)
    assert not lenient
    assert strict
def test_non_maori_letter():
    """'z' does not occur in the Māori alphabet."""
    result = is_maori('z')
    assert not result
def test_ending_consonant():
    """Māori syllables are open, so a word cannot end in a consonant."""
    result = is_maori('new')
    assert not result
def test_double_consonant():
    """Consonant clusters such as 'mm' never occur in Māori."""
    result = is_maori('mmea')
    assert not result
def test_english_word():
    """An English name is rejected."""
    result = is_maori('James Cooks')
    assert not result
def test_pacific_island():
    """A Pacific-island word containing a glottal stop is not Māori."""
    result = is_maori("ma'unga")
    assert not result
def test_macron():
    """Macronised vowels are valid Māori letters."""
    result = is_maori('tohutō')
    assert result
def test_hyphen():
    """A token with a leading hyphen is rejected."""
    result = is_maori('-maori')
    assert not result
def test_all_caps():
    """Upper-case input, including macronised capitals, is accepted."""
    result = is_maori('WHĀTUA')
    assert result
def test_long_hyphenated_word():
    """The famously long hyphenated place name is valid Māori."""
    place_name = (
        'Taumatawhakatangi-hangakoauauotamatea-turipukakapikimaunga-'
        'horonukupokaiwhenua-kitanatahu'
    )
    assert is_maori(place_name)
def test_he_whakaputanga():
    """The full text of He Whakaputanga passes the strict check."""
    # BUG FIX: read as UTF-8 explicitly — the transcript contains
    # macronised vowels, and the platform default encoding (e.g. cp1252
    # on Windows) could mis-decode or fail on them.
    with open('data/he-whakaputanga.txt', 'r', encoding='utf-8') as f:
        transcript = f.read()
    assert is_maori(transcript, strict=True)
def test_non_maori_word():
    """'tongue' fails even the lenient (non-strict) check."""
    result = is_maori('tongue', strict=False)
    assert not result
def test_triple_vowel():
    """Three identical vowels in a row never occur in Māori."""
    result = is_maori("teee")
    assert not result
def test_sentence():
    """A complete Māori sentence passes the strict check."""
    sentence = ("inā tatū te tai ka puare tēnei toka ka taea te haere mai "
                "i reira ki uta")
    assert is_maori(sentence, strict=True)