def test_corpus_valid_syllables(self):
    # Every syllable of every corpus word must be classified as one of the
    # known SYLLABLE_TYPES.  When classification raises ValueError, the
    # offending word and its syllabification are printed before the error
    # is re-raised, so the failure can be traced back to the corpus entry.
    for word in read_corpus_words('../data/Ryan_Latin_master.txt'):
        syllables = syllabify(word)
        for syllable in syllables:
            try:
                self.assertIn(identify_syllable_type(syllable),
                              SYLLABLE_TYPES)
            except ValueError as err:
                print('Word: ' + word)
                print('Syllables: ' + str(syllables))
                raise err
def preprocess_words(path: str) -> List[List[str]]:
    """Read a corpus file and return its words as lists of syllables.

    Every word read from ``path`` is syllabified.  A word is kept only if
    each of its syllables, after cleaning, can be classified by
    ``identify_syllable_type``; a word for which classification raises
    ``ValueError`` is skipped.  Progress and the skipped words are reported
    on stdout.

    :param path: location of the corpus file handed to ``read_corpus_words``
    :return: the syllabified words whose syllables were all classifiable,
        in corpus order
    """
    words = read_corpus_words(path)
    word_count = len(words)
    print(f'Read in {word_count} Latin words')

    syllabified_words = [syllabify(word) for word in words]

    real_syllabified_words = []
    skipped = []
    for word in syllabified_words:
        try:
            # Classification is used purely as a validity check here;
            # the resulting type itself is not needed.
            for syllable in word:
                identify_syllable_type(clean_syllable(syllable))
        except ValueError:
            skipped.append(''.join(word))
        else:
            real_syllabified_words.append(word)

    print(f'Processed {len(real_syllabified_words)} out of {word_count} words')
    print(f'Skipped the following words: {skipped}')
    return real_syllabified_words
def test_cplusvvc(self):
    # 'staes' is expected to be classified as 'C+VVC'.
    self.assertEqual('C+VVC', identify_syllable_type('staes'))
def test_chlvvc(self):
    # 'phraes' is expected to be classified as 'CLVVC'
    # (the expected type has a single 'C' for the leading 'ph').
    self.assertEqual('CLVVC', identify_syllable_type('phraes'))
def test_clvc(self):
    # 'plis' is expected to be classified as 'CLVC'.
    expected = 'CLVC'
    actual = identify_syllable_type('plis')
    self.assertEqual(expected, actual)
def test_cvz(self):
    # 'paz' is expected to be classified as 'CVC+', i.e. the final 'z'
    # is expected to produce a 'C+' coda.
    self.assertEqual('CVC+', identify_syllable_type('paz'))
def test_iv(self):
    # Consonantal i, as in iacere: 'ia' is expected to be 'CV', not 'VV'.
    expected = 'CV'
    actual = identify_syllable_type('ia')
    self.assertEqual(expected, actual)
def test_cplusvvcplus(self):
    # 'scaest' is expected to be classified as 'C+VVC+'.
    self.assertEqual('C+VVC+', identify_syllable_type('scaest'))
def test_chlvv(self):
    # 'chrae' is expected to be classified as 'CLVV'
    # (the expected type has a single 'C' for the leading 'ch').
    self.assertEqual('CLVV', identify_syllable_type('chrae'))
def test_cvv(self):
    # 'cui' is expected to be classified as 'CVV'.
    expected = 'CVV'
    actual = identify_syllable_type('cui')
    self.assertEqual(expected, actual)
def test_c3v(self):
    # 'stra' (three-consonant onset) is expected to be classified as 'C+V'.
    self.assertEqual('C+V', identify_syllable_type('stra'))
def test_clv(self):
    # 'pre' is expected to be classified as 'CLV'.
    self.assertEqual('CLV', identify_syllable_type('pre'))
def test_quv(self):
    # 'qui' is expected to be classified as 'CLV', the same type the
    # sibling test expects for 'pre'.
    expected = 'CLV'
    actual = identify_syllable_type('qui')
    self.assertEqual(expected, actual)
def test_chv(self):
    # 'rhe' is expected to be classified as 'CV'
    # (the expected type has a single 'C' for the leading 'rh').
    self.assertEqual('CV', identify_syllable_type('rhe'))
def test_clvcplus(self):
    # 'clast' is expected to be classified as 'CLVC+'.
    self.assertEqual('CLVC+', identify_syllable_type('clast'))
def test_chlvvcplus(self):
    # 'phlaest' is expected to be classified as 'CLVVC+'.
    expected = 'CLVVC+'
    actual = identify_syllable_type('phlaest')
    self.assertEqual(expected, actual)
def test_vx(self):
    # 'ax' is expected to be classified as 'VC+', i.e. the final 'x'
    # is expected to produce a 'C+' coda.
    self.assertEqual('VC+', identify_syllable_type('ax'))
def test_vvc(self):
    # 'aes' is expected to be classified as 'VVC'.
    self.assertEqual('VVC', identify_syllable_type('aes'))
def test_cvc(self):
    # 'non' is expected to be classified as 'CVC'.
    expected = 'CVC'
    actual = identify_syllable_type('non')
    self.assertEqual(expected, actual)
def extract_features(syllables: List[str], vocabulary: Iterable[str],
                     use_features=ALL_FEATURES) -> List[Dict[str, float]]:
    """Build one feature dictionary per syllable of a syllabified word.

    Each syllable is cleaned, then its syllable type, vowel nucleus and coda
    are computed once up front.  For every syllable, each feature group that
    is present in ``use_features`` adds one or more keys with value ``1.0``
    to that syllable's dictionary.

    :param syllables: the syllables of one word, in order
    :param vocabulary: known syllables; a cleaned syllable not found in it
        is represented by the ``UNK`` key (only consulted when
        ``VOCAB_FEATURE`` is enabled)
    :param use_features: the enabled feature groups; tested with ``in``
    :return: a list, parallel to ``syllables``, of ``defaultdict(float)``
        feature dictionaries
    """

    def _key(prefix, value, fallback):
        # Prefixed key when a value is present, otherwise the fallback key.
        return prefix + value if value else fallback

    syl_length = len(syllables)
    ult_i = syl_length - 1
    penult_i = syl_length - 2
    antepenult_i = syl_length - 3

    cleaned_syllables = [clean_syllable(raw) for raw in syllables]
    syllable_types = [identify_syllable_type(syl)
                      for syl in cleaned_syllables]
    nuclei = [extract_vowels(syl) for syl in cleaned_syllables]
    codas = [extract_coda(syl) for syl in cleaned_syllables]

    # Position labels, keyed by syllable index.  Indices in the loop are
    # never negative, so negative keys (short words) simply never match.
    initial_labels = {0: 'INIT', 1: 'POSTINIT'}
    final_labels = {
        ult_i: 'ULT',
        penult_i: 'PENULT',
        antepenult_i: 'ANTEPENULT',
    }
    # Same labels, shifted by one, for a syllable directly followed by 'que'.
    before_que_labels = {
        ult_i - 1: 'ULT+QUE',
        penult_i - 1: 'PENULT+QUE',
        antepenult_i - 1: 'ANTEPENULT+QUE',
    }

    features: List[Dict[str, float]] = []
    # The index is kept so that adjacent syllables can be looked up.
    for i, (syl, syl_type, vowels, coda) in enumerate(
            zip(cleaned_syllables, syllable_types, nuclei, codas)):
        has_prev = i > 0
        has_next = i < ult_i
        feats = defaultdict(float)

        if VOCAB_FEATURE in use_features:
            # Basic feature for each syllable in the vocabulary
            feats[syl if syl in vocabulary else UNK] = 1.0

        if SYLLABLE_TYPE_FEATURE in use_features:
            feats['TYPE=' + syl_type] = 1.0

        if CODA_TYPE_FEATURE in use_features:
            feats[_key('CODA_TYPE=', get_coda_type(syl_type),
                       'NO_CODA')] = 1.0

        if ADJ_TYPE_FEATURE in use_features:
            if has_prev:
                feats['PRE_TYPE=' + syllable_types[i - 1]] = 1.0
            if has_next:
                feats['POST_TYPE=' + syllable_types[i + 1]] = 1.0

        if ADJ_CODA_TYPE_FEATURE in use_features:
            if has_prev:
                feats[_key('PRE_CODA_TYPE=',
                           get_coda_type(syllable_types[i - 1]),
                           'NO_PRE_CODA')] = 1.0
            if has_next:
                feats[_key('POST_CODA_TYPE=',
                           get_coda_type(syllable_types[i + 1]),
                           'NO_POST_CODA')] = 1.0

        if VOWEL_FEATURE in use_features:
            # No need to mark diphthongs twice,
            # DIPHTHONG is a better feature
            if DIPHTHONG_FEATURE not in use_features or len(vowels) <= 1:
                feats['VOWEL=' + vowels] = 1.0

        if ADJ_VOWEL_FEATURE in use_features:
            if has_prev:
                prev_vowels = nuclei[i - 1]
                if (ADJ_DIPHTHONG_FEATURE not in use_features
                        or len(prev_vowels) <= 1):
                    feats['PRE_VOWEL=' + prev_vowels] = 1.0
            if has_next:
                next_vowels = nuclei[i + 1]
                if (ADJ_DIPHTHONG_FEATURE not in use_features
                        or len(next_vowels) <= 1):
                    feats['POST_VOWEL=' + next_vowels] = 1.0

        # All diphthongs are long
        if DIPHTHONG_FEATURE in use_features and len(vowels) > 1:
            feats['DIPHTHONG'] = 1.0

        if ADJ_DIPHTHONG_FEATURE in use_features:
            if has_prev and len(nuclei[i - 1]) > 1:
                feats['PRE_DIPHTHONG'] = 1.0
            if has_next and len(nuclei[i + 1]) > 1:
                feats['POST_DIPHTHONG'] = 1.0

        if CODA_FEATURE in use_features:
            feats[_key('CODA=', coda, 'NO_CODA')] = 1.0

        if ADJ_CODA_FEATURE in use_features:
            if has_prev:
                feats[_key('PRE_CODA=', codas[i - 1], 'NO_PRE_CODA')] = 1.0
            if has_next:
                feats[_key('POST_CODA=', codas[i + 1], 'NO_POST_CODA')] = 1.0

        if RHYME_FEATURE in use_features:
            # TODO use vocabulary of rhymes or allow all?
            rhyme = vowels + coda
            feats['RHYME=' + rhyme] = 1.0

        if ADJ_RHYME_FEATURE in use_features:
            if has_prev:
                prev_rhyme = nuclei[i - 1] + codas[i - 1]
                feats['PRE_RHYME=' + prev_rhyme] = 1.0
            if has_next:
                next_rhyme = nuclei[i + 1] + codas[i + 1]
                feats['POST_RHYME=' + next_rhyme] = 1.0

        if VCC_FEATURE in use_features:
            # A vowel followed by two consonants (except muta cum liquida)
            # is always short, even if they're split across coda/onset
            # of this and the following syllable
            next_type = syllable_types[i + 1] if has_next else ''
            split_cluster = (syl_type.endswith('C')
                             and next_type.startswith('CV'))
            if syl_type.endswith('C+') or split_cluster:
                feats['VCC'] = 1.0

        if POSTINIT_FEATURE in use_features:
            initial_label = initial_labels.get(i)
            if initial_label is not None:
                feats[initial_label] = 1.0

        if ANTEPEN_FEATURE in use_features:
            # Ultimate, penultimate and antepenultimate
            final_label = final_labels.get(i)
            if final_label is not None:
                feats[final_label] = 1.0

        if QUE_FEATURE in use_features and ANTEPEN_FEATURE in use_features:
            if has_next and cleaned_syllables[i + 1] == 'que':
                que_label = before_que_labels.get(i)
                if que_label is not None:
                    feats[que_label] = 1.0

        if EVEN_ODD_FEATURE in use_features:
            feats['EVEN' if i % 2 == 0 else 'ODD'] = 1.0

        features.append(feats)

    return features
def test_i(self):
    # Make sure not consonantal i: a lone 'i' is expected to be 'V'.
    self.assertEqual('V', identify_syllable_type('i'))