Beispiel #1
0
def extract_features(syllables: List[str], vocabulary: Iterable[str],
                     use_features=ALL_FEATURES) \
        -> List[Dict[str, float]]:
    features: List[Dict[str, float]] = []
    syl_length = len(syllables)
    ult_i = syl_length - 1
    penult_i = syl_length - 2
    antepenult_i = syl_length - 3

    cleaned_syllables = [clean_syllable(syllable) for syllable in syllables]
    syllable_types = [
        identify_syllable_type(syllable) for syllable in cleaned_syllables
    ]
    nuclei = [extract_vowels(syllable) for syllable in cleaned_syllables]
    codas = [extract_coda(syllable) for syllable in cleaned_syllables]

    for i in range(syl_length):
        # Use for-i so that adjacent syllables can be accessed if needed
        feature_dict = defaultdict(float)

        cleaned_syllable = cleaned_syllables[i]
        syl_type = syllable_types[i]
        vowels = nuclei[i]
        coda = codas[i]

        if VOCAB_FEATURE in use_features:
            # Add basic feature for each syllable in vocabulary
            if cleaned_syllable in vocabulary:
                feature_dict[cleaned_syllable] = 1.0
            else:
                feature_dict[UNK] = 1.0

        if SYLLABLE_TYPE_FEATURE in use_features:
            feature_dict['TYPE=' + syl_type] = 1.0

        if CODA_TYPE_FEATURE in use_features:
            coda_type = get_coda_type(syl_type)
            if coda_type:
                feature_dict['CODA_TYPE=' + coda_type] = 1.0
            else:
                feature_dict['NO_CODA'] = 1.0

        if ADJ_TYPE_FEATURE in use_features:
            if i != 0:
                pre_type = syllable_types[i - 1]
                feature_dict['PRE_TYPE=' + pre_type] = 1.0
            if i != ult_i:
                post_type = syllable_types[i + 1]
                feature_dict['POST_TYPE=' + post_type] = 1.0

        if ADJ_CODA_TYPE_FEATURE in use_features:
            if i != 0:
                pre_coda_type = get_coda_type(syllable_types[i - 1])
                if pre_coda_type:
                    feature_dict['PRE_CODA_TYPE=' + pre_coda_type] = 1.0
                else:
                    feature_dict['NO_PRE_CODA'] = 1.0
            if i != ult_i:
                post_coda_type = get_coda_type(syllable_types[i + 1])
                if post_coda_type:
                    feature_dict['POST_CODA_TYPE=' + post_coda_type] = 1.0
                else:
                    feature_dict['NO_POST_CODA'] = 1.0

        if VOWEL_FEATURE in use_features:
            if not (DIPHTHONG_FEATURE in use_features and len(vowels) > 1):
                # No need to mark diphthongs twice,
                # DIPHTHONG is a better feature
                feature_dict['VOWEL=' + vowels] = 1.0

        if ADJ_VOWEL_FEATURE in use_features:
            if i != 0:
                pre_vowels = nuclei[i - 1]
                if not (ADJ_DIPHTHONG_FEATURE in use_features
                        and len(pre_vowels) > 1):
                    feature_dict['PRE_VOWEL=' + pre_vowels] = 1.0
            if i != ult_i:
                post_vowels = nuclei[i + 1]
                if not (ADJ_DIPHTHONG_FEATURE in use_features
                        and len(post_vowels) > 1):
                    feature_dict['POST_VOWEL=' + post_vowels] = 1.0

        if DIPHTHONG_FEATURE in use_features:
            # All diphthongs are long
            if len(vowels) > 1:
                feature_dict['DIPHTHONG'] = 1.0

        if ADJ_DIPHTHONG_FEATURE in use_features:
            if i != 0:
                pre_vowels = nuclei[i - 1]
                if len(pre_vowels) > 1:
                    feature_dict['PRE_DIPHTHONG'] = 1.0
            if i != ult_i:
                post_vowels = nuclei[i + 1]
                if len(post_vowels) > 1:
                    feature_dict['POST_DIPHTHONG'] = 1.0

        if CODA_FEATURE in use_features:
            if coda:
                feature_dict['CODA=' + coda] = 1.0
            else:
                feature_dict['NO_CODA'] = 1.0

        if ADJ_CODA_FEATURE in use_features:
            if i != 0:
                pre_coda = codas[i - 1]
                if pre_coda:
                    feature_dict['PRE_CODA=' + pre_coda] = 1.0
                else:
                    feature_dict['NO_PRE_CODA'] = 1.0
            if i != ult_i:
                post_coda = codas[i + 1]
                if post_coda:
                    feature_dict['POST_CODA=' + post_coda] = 1.0
                else:
                    feature_dict['NO_POST_CODA'] = 1.0

        if RHYME_FEATURE in use_features:
            # TODO use vocabulary of rhymes or allow all?
            rhyme = vowels + coda
            feature_dict['RHYME=' + rhyme] = 1.0

        if ADJ_RHYME_FEATURE in use_features:
            if i != 0:
                pre_vowels = nuclei[i - 1]
                pre_coda = codas[i - 1]
                pre_rhyme = pre_vowels + pre_coda
                feature_dict['PRE_RHYME=' + pre_rhyme] = 1.0
            if i != ult_i:
                post_vowels = nuclei[i + 1]
                post_coda = codas[i + 1]
                post_rhyme = post_vowels + post_coda
                feature_dict['POST_RHYME=' + post_rhyme] = 1.0

        if VCC_FEATURE in use_features:
            # A vowel followed by two consonants (except muta cum liquida)
            # is always short, even if they're split across coda/onset
            # of this and the following syllable
            post_syl_type = syllable_types[i + 1] if i != ult_i else ''
            if syl_type.endswith('C+') or (syl_type.endswith('C')
                                           and post_syl_type.startswith('CV')):
                feature_dict['VCC'] = 1.0

        if POSTINIT_FEATURE in use_features:
            if i == 0:
                feature_dict['INIT'] = 1.0
            elif i == 1:
                feature_dict['POSTINIT'] = 1.0

        if ANTEPEN_FEATURE in use_features:
            # Ultimate, penultimate and antepenultimate
            if i == ult_i:
                feature_dict['ULT'] = 1.0
            elif i == penult_i:
                feature_dict['PENULT'] = 1.0
            elif i == antepenult_i:
                feature_dict['ANTEPENULT'] = 1.0

        if QUE_FEATURE in use_features and ANTEPEN_FEATURE in use_features:
            next_syl = cleaned_syllables[i + 1] if i != ult_i else None
            if next_syl == 'que':
                if i == ult_i - 1:
                    feature_dict['ULT+QUE'] = 1.0
                elif i == penult_i - 1:
                    feature_dict['PENULT+QUE'] = 1.0
                elif i == antepenult_i - 1:
                    feature_dict['ANTEPENULT+QUE'] = 1.0

        if EVEN_ODD_FEATURE in use_features:
            if i % 2 == 0:
                feature_dict['EVEN'] = 1.0
            else:
                feature_dict['ODD'] = 1.0

        features.append(feature_dict)

    return features
Beispiel #2
0
 def test_quvv(self):
     syllable = 'quae'
     vowels = extract_vowels(syllable)
     self.assertEqual('ae', vowels)
Beispiel #3
0
 def test_cvc(self):
     syllable = 'sum'
     vowels = extract_vowels(syllable)
     self.assertEqual('u', vowels)
Beispiel #4
0
 def test_cvvc(self):
     syllable = 'haec'
     vowels = extract_vowels(syllable)
     self.assertEqual('ae', vowels)
Beispiel #5
0
 def test_y(self):
     syllable = 'y'
     vowels = extract_vowels(syllable)
     self.assertEqual('y', vowels)