Exemple #1
0
 def test_syllabify_phonemes(self):
     """Check that a phoneme sequence is grouped into the expected syllables."""
     vowel_symbols = ["a", "ɛ", "i", "ɔ", "ɒ", "ø", "u", "y", "œ", "e", "o", "j"]
     # Consonant classes that follow the vowel class in the hierarchy.
     consonant_classes = [
         ["r"],
         ["l"],
         ["m", "n"],
         ["f", "v", "θ", "ð", "s", "h"],
         ["b", "d", "g", "k", "p", "t"],
     ]

     phoneme_syllabifier = Syllabifier()
     # The vowel list itself is the first class of the hierarchy.
     phoneme_syllabifier.set_hierarchy([vowel_symbols] + consonant_classes)
     phoneme_syllabifier.set_vowels(vowel_symbols)

     phonemes = [ont.a, ont.s, ont.g, ont.a, ont.r, ont.dh, ont.r]
     actual = phoneme_syllabifier.syllabify_phonemes(phonemes)

     expected = [[ont.a, ont.s], [ont.g, ont.a, ont.r, ont.dh, ont.r]]
     self.assertListEqual(actual, expected)
Exemple #2
0
 def test_syllabify_phonemes(self):
     """Check that a phoneme sequence is grouped into the expected syllables."""
     vowel_symbols = ["a", "ɛ", "i", "ɔ", "ɒ", "ø", "u", "y", "œ", "e", "o", "j"]
     # Consonant classes that follow the vowel class in the hierarchy.
     consonant_classes = [
         ["r"],
         ["l"],
         ["m", "n"],
         ["f", "v", "θ", "ð", "s", "h"],
         ["b", "d", "g", "k", "p", "t"],
     ]

     phoneme_syllabifier = Syllabifier()
     # The vowel list itself is the first class of the hierarchy.
     phoneme_syllabifier.set_hierarchy([vowel_symbols] + consonant_classes)
     phoneme_syllabifier.set_vowels(vowel_symbols)

     phonemes = [ont.a, ont.s, ont.g, ont.a, ont.r, ont.dh, ont.r]
     actual = phoneme_syllabifier.syllabify_phonemes(phonemes)

     expected = [[ont.a, ont.s], [ont.g, ont.a, ont.r, ont.dh, ont.r]]
     self.assertListEqual(actual, expected)
Exemple #3
0
 def syllabify(self, hierarchy):
     """
     Syllabify every line of the text and store the result in
     ``self.syllabified_text``.

     Syllables may play a role in verse classification.

     :param hierarchy: phonetic hierarchy handed to the syllabifier
     :return: None; ``self.syllabified_text`` gets one list per long line,
         holding the ``syllabified`` attribute of each of its lines (or an
         empty list when no text was imported)
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
         return

     line_syllabifier = Syllabifier(language="old_norse", break_geminants=True)
     line_syllabifier.set_hierarchy(hierarchy)

     per_long_line = []
     for long_line in self.long_lines:
         line_syllables = []
         for line in long_line:
             assert isinstance(line, (ShortLine, LongLine))
             # syllabify() stores its result on the line object itself.
             line.syllabify(line_syllabifier)
             line_syllables.append(line.syllabified)
         per_long_line.append(line_syllables)
     self.syllabified_text = per_long_line
Exemple #4
0
 def syllabify(self, hierarchy):
     """
     Syllabify every line of the text and store the result in
     ``self.syllabified_text``.

     Syllables may play a role in verse classification.

     :param hierarchy: phonetic hierarchy handed to the syllabifier
     :return: None; ``self.syllabified_text`` gets one list per long line,
         holding the ``syllabified`` attribute of each of its lines (or an
         empty list when no text was imported)
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
         return

     line_syllabifier = Syllabifier(language="old_norse", break_geminants=True)
     line_syllabifier.set_hierarchy(hierarchy)

     per_long_line = []
     for long_line in self.long_lines:
         line_syllables = []
         for line in long_line:
             assert isinstance(line, (ShortLine, LongLine))
             # syllabify() stores its result on the line object itself.
             line.syllabify(line_syllabifier)
             line_syllables.append(line.syllabified)
         per_long_line.append(line_syllables)
     self.syllabified_text = per_long_line
Exemple #5
0
    def syllabify(self, hierarchy: Dict[str, int]):
        """
        Syllabify each short line and store the results, in order, in
        ``self.syllabified_text``.

        >>> stanza = "Ein sat hon úti,\\nþá er inn aldni kom\\nyggjungr ása\\nok í augu leit.\\nHvers fregnið mik?\\nHví freistið mín?\\nAllt veit ek, Óðinn,\\nhvar þú auga falt,\\ní inum mæra\\nMímisbrunni.\\nDrekkr mjöð Mímir\\nmorgun hverjan\\naf veði Valföðrs.\\nVituð ér enn - eða hvat?"
        >>> us = UnspecifiedStanza()
        >>> us.from_short_lines_text(stanza)
        >>> us.syllabify(old_norse_syllabifier.hierarchy)
        >>> us.syllabified_text
        [[['ein'], ['sat'], ['hon'], ['út', 'i']], [['þá'], ['er'], ['inn'], ['al', 'dni'], ['kom']], [['yg', 'gjungr'], ['ás', 'a']], [['ok'], ['í'], ['aug', 'u'], ['leit']], [['hvers'], ['freg', 'nið'], ['mik']], [['hví'], ['freis', 'tið'], ['mín']], [['allt'], ['veit'], ['ek'], ['ó', 'ðinn']], [['hvar'], ['þú'], ['aug', 'a'], ['falt']], [['í'], ['i', 'num'], ['mær', 'a']], [['mí', 'mis', 'brun', 'ni']], [['drekkr'], ['mjöð'], ['mí', 'mir']], [['mor', 'gun'], ['hver', 'jan']], [['af'], ['veð', 'i'], ['val', 'föðrs']], [['vi', 'tuð'], ['ér'], ['enn'], ['eð', 'a'], ['hvat']]]

        :param hierarchy: phonetic hierarchy handed to the syllabifier
        :return: None; the result is stored on the instance
        """
        line_syllabifier = Syllabifier(language="non", break_geminants=True)
        line_syllabifier.set_hierarchy(hierarchy)

        def _syllabified(line):
            # Syllabify one short line in place and hand back its syllables.
            assert isinstance(line, ShortLine)
            line.syllabify(line_syllabifier)
            return line.syllabified

        self.syllabified_text = [_syllabified(line) for line in self.short_lines]
Exemple #6
0
    def syllabify(self, hierarchy):
        """
        Syllabify each short line and store the results, in order, in
        ``self.syllabified_text``.

        >>> stanza = "Ein sat hon úti,\\nþá er inn aldni kom\\nyggjungr ása\\nok í augu leit.\\nHvers fregnið mik?\\nHví freistið mín?\\nAllt veit ek, Óðinn,\\nhvar þú auga falt,\\ní inum mæra\\nMímisbrunni.\\nDrekkr mjöð Mímir\\nmorgun hverjan\\naf veði Valföðrs.\\nVituð ér enn - eða hvat?"
        >>> us = UnspecifiedStanza()
        >>> us.from_short_lines_text(stanza)
        >>> us.syllabify(old_norse_syllabifier.hierarchy)
        >>> us.syllabified_text
        [[['ein'], ['sat'], ['hon'], ['út', 'i']], [['þá'], ['er'], ['inn'], ['al', 'dni'], ['kom']], [['yg', 'gjungr'], ['ás', 'a']], [['ok'], ['í'], ['aug', 'u'], ['leit']], [['hvers'], ['freg', 'nið'], ['mik']], [['hví'], ['freis', 'tið'], ['mín']], [['allt'], ['veit'], ['ek'], ['ó', 'ðinn']], [['hvar'], ['þú'], ['aug', 'a'], ['falt']], [['í'], ['i', 'num'], ['mær', 'a']], [['mí', 'mis', 'brun', 'ni']], [['drekkr'], ['mjöð'], ['mí', 'mir']], [['mor', 'gun'], ['hver', 'jan']], [['af'], ['veð', 'i'], ['val', 'föðrs']], [['vi', 'tuð'], ['ér'], ['enn'], ['eð', 'a'], ['hvat']]]

        :param hierarchy: phonetic hierarchy handed to the syllabifier
        :return: None; the result is stored on the instance
        """
        line_syllabifier = Syllabifier(language="old_norse", break_geminants=True)
        line_syllabifier.set_hierarchy(hierarchy)

        def _syllabified(line):
            # Syllabify one short line in place and hand back its syllables.
            assert isinstance(line, ShortLine)
            line.syllabify(line_syllabifier)
            return line.syllabified

        self.syllabified_text = [_syllabified(line) for line in self.short_lines]
Exemple #7
0
from cltk.phonology.syllabify import Syllabifier
from cltk.tokenize.word import WordTokenizer
from cltk.corpus.old_norse.syllabifier import hierarchy, invalid_onsets
from cltk.text_reuse.levenshtein import Levenshtein

from zoegas.constants import postags, dictionary_name, pos_verbose

# Module-level phonetic transcriber, built from the diphthong table, the two
# IPA class tables and the Old Norse rules exposed by ``ont``.
# NOTE(review): ``phu`` and ``ont`` are not imported in the visible part of the
# file - confirm they are imported above.
phonetic_transcriber = phu.Transcriber(ont.DIPHTHONGS_IPA,
                                       ont.DIPHTHONGS_IPA_class, ont.IPA_class,
                                       ont.old_norse_rules)

# Module-level Old Norse syllabifier, configured with the invalid onsets and
# the hierarchy imported from cltk.corpus.old_norse.syllabifier.
# NOTE(review): break_geminants=True presumably splits geminate consonants
# across syllables - confirm against the CLTK Syllabifier documentation.
s = Syllabifier(language="old_norse", break_geminants=True)
s.set_invalid_onsets(invalid_onsets)
s.set_hierarchy(hierarchy)

# Module-level word tokenizer created for the "old_norse" language.
old_norse_word_tokenizer = WordTokenizer("old_norse")


def clean(text: str) -> Optional[str]:
    """

    :param text:
    :return:
    """
    if text is not None:
        text = re.sub(r"\t", "", text)
        text = re.sub(r"\n", "", text)
        return text
    else: