Beispiel #1
0
 def test_corpus_valid_syllables(self):
     # Every syllable of every corpus word must be classified as one of
     # SYLLABLE_TYPES.
     # NOTE: the corpus path is relative, so the test depends on the
     # working directory it is started from.
     corpus_words = read_corpus_words('../data/Ryan_Latin_master.txt')
     for word in corpus_words:
         syllables = syllabify(word)
         for syllable in syllables:
             try:
                 syl_type = identify_syllable_type(syllable)
             except ValueError:
                 # Report the offending word before propagating; a bare
                 # ``raise`` keeps the original traceback untouched.
                 print('Word: ' + word)
                 print('Syllables: ' + str(syllables))
                 raise
             # Outside the ``try`` so only classification errors trigger
             # the diagnostics above; the message pinpoints the word when
             # the type itself is unexpected.
             self.assertIn(
                 syl_type, SYLLABLE_TYPES,
                 msg='Word: ' + word + ', syllables: ' + str(syllables))
def preprocess_words(path: str) -> List[List[str]]:
    """Load a corpus and return the syllabified words that can be classified.

    Each word read via ``read_corpus_words`` is split with ``syllabify``.
    A word is kept only if every one of its syllables passes through
    ``clean_syllable`` and ``identify_syllable_type`` without raising
    ``ValueError``; otherwise it is recorded as skipped. Progress and the
    skipped words are printed to stdout.

    Args:
        path: Corpus location, passed unchanged to ``read_corpus_words``.

    Returns:
        The kept words in corpus order, each as the list of syllables
        produced by ``syllabify`` (i.e. not the cleaned syllables).
    """
    words = read_corpus_words(path)
    word_count = len(words)
    print(f'Read in {word_count} Latin words')
    syllabified_words = [syllabify(word) for word in words]
    real_syllabified_words = []
    skipped = []
    for word in syllabified_words:
        try:
            for syllable in word:
                # Called purely for validation: the resulting type is not
                # needed here, only whether classification succeeds.
                identify_syllable_type(clean_syllable(syllable))
        except ValueError:
            skipped.append(''.join(word))
        else:
            real_syllabified_words.append(word)
    print(f'Processed {len(real_syllabified_words)} out of {word_count} words')
    print(f'Skipped the following words: {skipped}')

    return real_syllabified_words
Beispiel #3
0
 def test_cplusvvc(self):
     # 'staes' must be classified as syllable type 'C+VVC'.
     self.assertEqual('C+VVC', identify_syllable_type('staes'))
Beispiel #4
0
 def test_chlvvc(self):
     # 'phraes' must be classified as syllable type 'CLVVC'.
     self.assertEqual('CLVVC', identify_syllable_type('phraes'))
Beispiel #5
0
 def test_clvc(self):
     # 'plis' must be classified as syllable type 'CLVC'.
     self.assertEqual('CLVC', identify_syllable_type('plis'))
Beispiel #6
0
 def test_cvz(self):
     # 'paz' must be classified as 'CVC+': the final 'z' is expected to
     # yield the 'C+' coda marker.
     self.assertEqual('CVC+', identify_syllable_type('paz'))
Beispiel #7
0
 def test_iv(self):
     # Consonantal i (as in iacere): 'ia' must be classified as 'CV'.
     self.assertEqual('CV', identify_syllable_type('ia'))
Beispiel #8
0
 def test_cplusvvcplus(self):
     # 'scaest' must be classified as syllable type 'C+VVC+'.
     self.assertEqual('C+VVC+', identify_syllable_type('scaest'))
Beispiel #9
0
 def test_chlvv(self):
     # 'chrae' must be classified as syllable type 'CLVV'.
     self.assertEqual('CLVV', identify_syllable_type('chrae'))
Beispiel #10
0
 def test_cvv(self):
     # 'cui' must be classified as syllable type 'CVV'.
     self.assertEqual('CVV', identify_syllable_type('cui'))
Beispiel #11
0
 def test_c3v(self):
     # Three-consonant onset: 'stra' must be classified as 'C+V'.
     self.assertEqual('C+V', identify_syllable_type('stra'))
Beispiel #12
0
 def test_clv(self):
     # 'pre' must be classified as syllable type 'CLV'.
     self.assertEqual('CLV', identify_syllable_type('pre'))
Beispiel #13
0
 def test_quv(self):
     # 'qui' must be classified as 'CLV' (contrast with 'cui' -> 'CVV').
     self.assertEqual('CLV', identify_syllable_type('qui'))
Beispiel #14
0
 def test_chv(self):
     # 'rhe' must be classified as syllable type 'CV'.
     self.assertEqual('CV', identify_syllable_type('rhe'))
Beispiel #15
0
 def test_clvcplus(self):
     # 'clast' must be classified as syllable type 'CLVC+'.
     self.assertEqual('CLVC+', identify_syllable_type('clast'))
Beispiel #16
0
 def test_chlvvcplus(self):
     # 'phlaest' must be classified as syllable type 'CLVVC+'.
     self.assertEqual('CLVVC+', identify_syllable_type('phlaest'))
Beispiel #17
0
 def test_vx(self):
     # 'ax' must be classified as 'VC+': the final 'x' is expected to
     # yield the 'C+' coda marker.
     self.assertEqual('VC+', identify_syllable_type('ax'))
Beispiel #18
0
 def test_vvc(self):
     # 'aes' must be classified as syllable type 'VVC'.
     self.assertEqual('VVC', identify_syllable_type('aes'))
Beispiel #19
0
 def test_cvc(self):
     # 'non' must be classified as syllable type 'CVC'.
     self.assertEqual('CVC', identify_syllable_type('non'))
Beispiel #20
0
def extract_features(syllables: List[str], vocabulary: Iterable[str],
                     use_features=ALL_FEATURES) \
        -> List[Dict[str, float]]:
    """Build one binary feature dict per syllable.

    Every syllable is first passed through ``clean_syllable``; its syllable
    type, vowel nucleus and coda are then derived with
    ``identify_syllable_type``, ``extract_vowels`` and ``extract_coda``.
    Each feature that fires is stored with the value ``1.0``.

    Args:
        syllables: Syllables in order. Positional features (INIT, ULT, ...)
            are relative to this list, so it is presumably one word.
        vocabulary: Known cleaned syllables, consulted only via ``in``.
            Syllables outside it fire the ``UNK`` feature instead.
        use_features: Container of the feature-group constants to enable
            (defaults to ``ALL_FEATURES``).

    Returns:
        A list with one ``defaultdict(float)`` per input syllable, in the
        same order as ``syllables``.
    """
    last = len(syllables) - 1

    cleaned_syllables = [clean_syllable(raw) for raw in syllables]
    syllable_types = [
        identify_syllable_type(cleaned) for cleaned in cleaned_syllables
    ]
    nuclei = [extract_vowels(cleaned) for cleaned in cleaned_syllables]
    codas = [extract_coda(cleaned) for cleaned in cleaned_syllables]

    # Index -> feature name, counted from the end of the word. Negative
    # keys (short words) can never equal a loop index and so never fire.
    position_names = {
        last: 'ULT',
        last - 1: 'PENULT',
        last - 2: 'ANTEPENULT',
    }
    # Same idea, shifted one syllable to the left for the QUE features.
    que_position_names = {
        last - 1: 'ULT+QUE',
        last - 2: 'PENULT+QUE',
        last - 3: 'ANTEPENULT+QUE',
    }

    result: List[Dict[str, float]] = []
    rows = zip(cleaned_syllables, syllable_types, nuclei, codas)
    for i, (cleaned, syl_type, vowels, coda) in enumerate(rows):
        # The index is needed so the neighbouring syllables can be inspected.
        has_prev = i > 0
        has_next = i < last
        feats = defaultdict(float)

        # Identity of the syllable itself, or UNK when out of vocabulary.
        if VOCAB_FEATURE in use_features:
            key = cleaned if cleaned in vocabulary else UNK
            feats[key] = 1.0

        if SYLLABLE_TYPE_FEATURE in use_features:
            feats['TYPE=' + syl_type] = 1.0

        if CODA_TYPE_FEATURE in use_features:
            coda_type = get_coda_type(syl_type)
            key = ('CODA_TYPE=' + coda_type) if coda_type else 'NO_CODA'
            feats[key] = 1.0

        if ADJ_TYPE_FEATURE in use_features:
            if has_prev:
                feats['PRE_TYPE=' + syllable_types[i - 1]] = 1.0
            if has_next:
                feats['POST_TYPE=' + syllable_types[i + 1]] = 1.0

        if ADJ_CODA_TYPE_FEATURE in use_features:
            if has_prev:
                prev_coda_type = get_coda_type(syllable_types[i - 1])
                key = (('PRE_CODA_TYPE=' + prev_coda_type)
                       if prev_coda_type else 'NO_PRE_CODA')
                feats[key] = 1.0
            if has_next:
                next_coda_type = get_coda_type(syllable_types[i + 1])
                key = (('POST_CODA_TYPE=' + next_coda_type)
                       if next_coda_type else 'NO_POST_CODA')
                feats[key] = 1.0

        # A multi-character nucleus is left to the DIPHTHONG feature when
        # that one is enabled, so it is not marked twice.
        if VOWEL_FEATURE in use_features and (
                DIPHTHONG_FEATURE not in use_features or len(vowels) <= 1):
            feats['VOWEL=' + vowels] = 1.0

        if ADJ_VOWEL_FEATURE in use_features:
            if has_prev:
                prev_vowels = nuclei[i - 1]
                if (ADJ_DIPHTHONG_FEATURE not in use_features
                        or len(prev_vowels) <= 1):
                    feats['PRE_VOWEL=' + prev_vowels] = 1.0
            if has_next:
                next_vowels = nuclei[i + 1]
                if (ADJ_DIPHTHONG_FEATURE not in use_features
                        or len(next_vowels) <= 1):
                    feats['POST_VOWEL=' + next_vowels] = 1.0

        # Original rationale: all diphthongs are long. Any nucleus longer
        # than one character is treated as a diphthong here.
        if DIPHTHONG_FEATURE in use_features and len(vowels) > 1:
            feats['DIPHTHONG'] = 1.0

        if ADJ_DIPHTHONG_FEATURE in use_features:
            if has_prev and len(nuclei[i - 1]) > 1:
                feats['PRE_DIPHTHONG'] = 1.0
            if has_next and len(nuclei[i + 1]) > 1:
                feats['POST_DIPHTHONG'] = 1.0

        # NOTE: 'NO_CODA' (and 'NO_PRE_CODA' / 'NO_POST_CODA' below) share
        # their keys with the coda-*type* features above.
        if CODA_FEATURE in use_features:
            key = ('CODA=' + coda) if coda else 'NO_CODA'
            feats[key] = 1.0

        if ADJ_CODA_FEATURE in use_features:
            if has_prev:
                prev_coda = codas[i - 1]
                key = ('PRE_CODA=' + prev_coda) if prev_coda else 'NO_PRE_CODA'
                feats[key] = 1.0
            if has_next:
                next_coda = codas[i + 1]
                key = (('POST_CODA=' + next_coda)
                       if next_coda else 'NO_POST_CODA')
                feats[key] = 1.0

        if RHYME_FEATURE in use_features:
            # TODO use vocabulary of rhymes or allow all?
            feats['RHYME=' + (vowels + coda)] = 1.0

        if ADJ_RHYME_FEATURE in use_features:
            if has_prev:
                feats['PRE_RHYME=' + (nuclei[i - 1] + codas[i - 1])] = 1.0
            if has_next:
                feats['POST_RHYME=' + (nuclei[i + 1] + codas[i + 1])] = 1.0

        if VCC_FEATURE in use_features:
            # Original rationale: a vowel followed by two consonants (except
            # muta cum liquida) is always short, even when the consonants
            # are split between this coda and the next syllable's onset.
            if syl_type.endswith('C+') or (
                    syl_type.endswith('C')
                    and has_next
                    and syllable_types[i + 1].startswith('CV')):
                feats['VCC'] = 1.0

        if POSTINIT_FEATURE in use_features:
            if i == 0:
                feats['INIT'] = 1.0
            elif i == 1:
                feats['POSTINIT'] = 1.0

        # Ultimate, penultimate and antepenultimate position markers.
        if ANTEPEN_FEATURE in use_features and i in position_names:
            feats[position_names[i]] = 1.0

        # The QUE features are only emitted together with ANTEPEN_FEATURE.
        # NOTE(review): the check looks only at the *immediately following*
        # syllable, so 'PENULT+QUE' / 'ANTEPENULT+QUE' can fire only when
        # that 'que' is not the last syllable - confirm this is intended.
        if QUE_FEATURE in use_features and ANTEPEN_FEATURE in use_features:
            if (has_next and cleaned_syllables[i + 1] == 'que'
                    and i in que_position_names):
                feats[que_position_names[i]] = 1.0

        # Parity of the zero-based syllable index.
        if EVEN_ODD_FEATURE in use_features:
            feats['ODD' if i % 2 else 'EVEN'] = 1.0

        result.append(feats)

    return result
Beispiel #21
0
 def test_i(self):
     # A lone 'i' must be classified as a plain vowel ('V'), i.e. it must
     # not be treated as consonantal i.
     self.assertEqual('V', identify_syllable_type('i'))