def test_latin_syllabifier(self):
    """Test Latin syllabifier."""
    expected = ['si', 'de', 're']
    actual = Syllabifier().syllabify('sidere')
    self.assertEqual(actual, expected)
def test_latin_syllabifier(self):
    """Test Latin syllabifier, including macronized words."""
    syllabifier = Syllabifier()
    # (word, expected syllables) — plain word first, then macronized forms
    cases = [
        ('sidere', ['si', 'de', 're']),
        ('audītū', ['au', 'dī', 'tū']),
        ('conjiciō', ['con', 'ji', 'ci', 'ō']),
        ('ā', ['ā']),
    ]
    for word, expected in cases:
        self.assertEqual(syllabifier.syllabify(word), expected)
def third_declension(self):
    """Decline a third-declension noun.

    Returns a list of 12 forms:
    [0] nom. sg., [1] gen. sg., [2] dat. sg., [3] acc. sg., [4] abl. sg.,
    [5] voc. sg., [6] nom. pl., [7] gen. pl., [8] dat. pl., [9] acc. pl.,
    [10] abl. pl., [11] voc. pl.
    """
    from cltk.stem.latin.syllabifier import Syllabifier

    syllabifier = Syllabifier()
    vowels = ['a', 'e', 'i', 'o', 'u', 'ā', 'ē', 'ī', 'ō', 'ū']
    base = DeclineNoun.id_declension(self)[1]
    forms = [self.nom, self.gen]
    # endings[3] is only a placeholder for the vocative singular slot
    # (forms[5]), which is always replaced by the nominative just below.
    endings = ['ī', 'em', 'e', '', 'ēs', 'um', 'ibus', 'ēs', 'ibus', 'ēs']
    for ending in endings:
        forms.append(base + ending)
    forms[5] = self.nom  # vocative singular equals the nominative

    # i-stem detection: parisyllabic nouns in -is/-es, or nouns in -x/-s
    # whose stem ends in two consonants.
    nom_syllable = len(syllabifier.syllabify(self.nom))
    gen_syllable = len(syllabifier.syllabify(self.gen))
    i_stem = False
    if nom_syllable == gen_syllable:
        if self.nom[-2:] in ['is', 'es']:
            i_stem = True
    elif self.nom[-1] in ['x', 's']:
        if base[-1] not in vowels and base[-2] not in vowels:
            i_stem = True
    if i_stem:
        # genitive plural: insert 'i' before the final '-um' -> '-ium'
        forms[7] = forms[7][:-2] + 'i' + forms[7][-2:]

    if self.gender == 'neutrum':
        # Neuters: acc. and voc. sg. equal the nominative;
        # nom./acc./voc. pl. end in -a.
        forms[5] = self.nom
        forms[3] = self.nom
        forms[6] = base + 'a'
        forms[9] = base + 'a'
        forms[11] = base + 'a'
        # Pure i-stems (nom. in -e, -al, -ar): abl. sg. -ī,
        # nom./acc./voc. pl. -ia, gen. pl. -ium.
        if self.nom[-1] == 'e' or self.nom[-2:] in ['al', 'ar']:
            forms[4] = base + 'ī'
            forms[6] = base + 'ia'
            forms[7] = base + 'ium'
            forms[9] = base + 'ia'
            forms[11] = base + 'ia'
    return forms
class ChantSyllabifier(metaclass=Singleton):
    """Latin syllabifier tuned for chant texts.

    Wraps the CLTK ``Syllabifier`` with chant-specific settings: a
    restricted diphthong list, 'h' treated as a mute consonant, and a
    large table of hand-corrected exception words.
    """

    def __init__(self):
        latin = deepcopy(LATIN)
        exceptions = self.get_exceptions()
        latin['exceptions'] = exceptions
        latin['diphthongs'] = ["ae", "au", "oe"]  # Not: eu, ei
        latin['mute_consonants_and_f'].append('h')
        self.syllabifier = Syllabifier(latin)

    def get_exceptions(self):
        """Return the merged exception dictionary: CLTK's own Latin
        exceptions extended with manual corrections for the most
        frequent under- and oversegmentations (percentages are how
        often each word accounted for the observed errors)."""
        # See notebook "Identify syllabification errors" for background:
        # We checked the most frequent under/oversegmentations, and
        # manually corrected those
        undersegmented = {
            'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],  # 29.43%
            'quia': ['qui', 'a'],  # 13.91%
            'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],  # 6.65%
            'israel': ['is', 'ra', 'el'],  # 2.64%
            'cui': ['cu', 'i'],  # 1.74%
            'michael': ['mic', 'ha', 'el'],  # 0.84%
            #'qui': ['qui'],  # 0.50%
            'requiem': ['re', 'qui', 'em'],  # 0.41%
            'huic': ['hu', 'ic'],  # 0.41%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],  # 0.38%
            # 'alleluia': ['al', 'le', 'lu', 'ia'],  # 0.27%
            #'noe': ['noe'],  # 0.22%
            'requiescet': ['re', 'qui', 'es', 'cet'],  # 0.21%
            'exiit': ['ex', 'i', 'it'],  # 0.17%
            'exierunt': ['ex', 'i', 'e', 'runt'],  # 0.13%
            'eloquium': ['e', 'lo', 'qui', 'um'],  # 0.12%
            'exiet': ['ex', 'i', 'et'],  # 0.12%
            # 'gelboe': ['gel', 'boe'],  # 0.11%
            'ierit': ['i', 'e', 'rit'],  # 0.10%
            'christi': ['chris', 'ti'],  # 0.10%
            'saul': ['sa', 'ul'],  # 0.09%
            'colloquiis': ['col', 'lo', 'qui', 'is'],  # 0.09%
            'israelita': ['is', 'ra', 'e', 'li', 'ta'],  # 0.09%
            'michaele': ['mic', 'ha', 'e', 'le'],  # 0.08%
            'requiescit': ['re', 'qui', 'es', 'cit'],  # 0.08%
            'obsequia': ['ob', 'se', 'qui', 'a'],  # 0.07%
            # 'jesus': ['je', 'sus'],  # 0.07%
            'nicolaum': ['ni', 'co', 'laum'],  # 0.06%
            'requies': ['re', 'qui', 'es'],  # 0.06%
            # BUG FIX: the final syllable had been mangled to 'c**t'
            # (profanity-masked), which can never match real text.
            'requiescunt': ['re', 'qui', 'es', 'cunt'],  # 0.06%
            'exierit': ['ex', 'i', 'e', 'rit'],  # 0.06%
            'michaelis': ['mic', 'ha', 'e', 'lis'],  # 0.05%
            'requiescent': ['re', 'qui', 'es', 'cent'],  # 0.05%
        }
        # Recurring issues are "guen" and "quu"
        oversegmented = {
            'sanguine': ['san', 'gui', 'ne'],  # 1.45%
            'sanguinem': ['san', 'gui', 'nem'],  # 1.43%
            'lingua': ['lin', 'gua'],  # 1.11%
            'alleluya': ['al', 'le', 'lu', 'ya'],  # 0.88%
            'sanguis': ['san', 'guis'],  # 0.83%
            'est*': ['est*'],  # 0.64%
            #'eleison': ['e', 'le', 'i', 'son'],  # 0.59%
            'linguis': ['lin', 'guis'],  # 0.59%
            'linguae': ['lin', 'guae'],  # 0.47%
            'sequuntur': ['se', 'quun', 'tur'],  # 0.42%
            'sanguinis': ['san', 'gui', 'nis'],  # 0.40%
            #'euge': ['e', 'u', 'ge'],  # 0.29%
            'eleemosynam': ['e', 'lee', 'mo', 'sy', 'nam'],  # 0.27%
            'iniquum': ['in', 'i', 'quum'],  # 0.23%
            'sunt*': ['sunt*'],  # 0.23%
            'unguenti': ['un', 'guen', 'ti'],  # 0.21%
            'persequuntur': ['per', 'se', 'quun', 'tur'],  # 0.20%
            'unguentum': ['un', 'guen', 'tum'],  # 0.20%
            'unguentorum': ['un', 'guen', 'to', 'rum'],  # 0.16%
            'urbs': ['urbs'],  # 0.16%
            'equuleo': ['e', 'quu', 'le', 'o'],  # 0.15%
            #'perpetuum': ['per', 'pe', 'tu', 'um'],  # 0.14%
            #'antiquus': ['an', 'ti', 'qu', 'us'],  # 0.14%
            'sanguinibus': ['san', 'gui', 'ni', 'bus'],  # 0.13%
            'eleemosyna': ['e', 'lee', 'mo', 'sy', 'na'],  # 0.13%
            'linguam': ['lin', 'guam'],  # 0.13%
            'stirps': ['stirps'],  # 0.11%
            #'ait': ['a', 'it'],  # 0.11%
            'languores': ['lan', 'guo', 'res'],  # 0.11%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],  # 0.10%
            'loquuntur': ['lo', 'quun', 'tur'],  # 0.09%
            # 'tuum': ['tu', 'um'],  # 0.09%
            # 'ideoque': ['i', 'de', 'o', 'que'],  # 0.09%
            'annuntiaverunt*': ['an', 'nun', 'ti', 'a', 've', 'runt*'],  # 0.09%
            'linguarum': ['lin', 'gua', 'rum'],  # 0.09%
            'in*': ['in*'],  # 0.09%
            'unguento': ['un', 'guen', 'to'],  # 0.09%
            'urguentes': ['ur', 'guen', 'tes'],  # 0.09%
            'langueo': ['lan', 'gue', 'o'],  # 0.08%
            'sanguinum': ['san', 'gui', 'num'],  # 0.08%
            'ihesum': ['ihe', 'sum'],  # 0.08%
            'languoribus': ['lan', 'guo', 'ri', 'bus'],  # 0.07%
            'probaverunt': ['pro', 'ba', 've', 'runt'],  # 0.07%
            'faciam': ['fa', 'ci', 'am'],  # 0.07%
            #'equum': ['e', 'qu', 'um'],  # 0.07%
            #'jerusalem*': ['je', 'ru', 'sa', 'lem*'],  # 0.07%
            'moyses': ['moy', 'ses'],  # 0.07%
            'pinguedine': ['pin', 'gue', 'di', 'ne'],  # 0.07%
            'linguas': ['lin', 'guas'],  # 0.06%
            #'erue': ['e', 'ru', 'e'],  # 0.06%
            'galaaditim': ['ga', 'laa', 'di', 'tim'],  # 0.06%
            'languentium': ['lan', 'guen', 'ti', 'um'],  # 0.05%
            'mansuetudinem': ['man', 'sue', 'tu', 'di', 'nem'],  # 0.05%
            #'iniquus': ['in', 'i', 'quus'],  # 0.05%
            #'filiis': ['fi', 'li', 'is'],  # 0.05%
            'gloria*': ['glo', 'ri', 'a*'],  # 0.05%
            'leyson': ['ley', 'son'],  # 0.05%
            'moysi': ['moy', 'si'],  # 0.05%
            #'suavitatis': ['su', 'a', 'vi', 'ta', 'tis'],  # 0.05%
            'accipite': ['ac', 'ci', 'pi', 'te'],  # 0.05%
            'exsurgens*': ['ex', 'sur', 'gens*'],  # 0.05%
        }
        js_cantus_exceptions = {
            # Exceptions from the alignment algorithm used on the
            # Cantus website
            #'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],
            #'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],
            #'alleluya': ['al', 'le', 'lu', 'ya'],
            'hierusalem': ['hie', 'ru', 'sa', 'lem'],
            'hiesum': ['hie', 'sum'],
            'kyrieleison': ['ky', 'ri', 'e', 'lei', 'son'],
            'xpisteleison': ['xpi', 'ste', 'lei', 'son'],
            'eleison': ['e', 'lei', 'son'],
        }
        # Later dicts win on duplicate keys (undersegmented < oversegmented
        # < js_cantus_exceptions), all overriding CLTK's defaults.
        exceptions = dict(LATIN['exceptions'],
                          **undersegmented,
                          **oversegmented,
                          **js_cantus_exceptions)
        return exceptions

    def syllabify(self, text):
        """
        Syllabifies the (lowercased) text

        Lowercased since otherwise CLTK doesn't work well
        """
        return self.syllabifier.syllabify(text.lower())
def _recalculate():
    """Rebuild the chant, note, syllable and n-gram data frames from the
    LMLO corpus and pickle each to disk ('chantData.zip', 'noteData.zip',
    'syllableData.zip', 'ngramData.zip')."""
    from cltk.stem.latin.syllabifier import Syllabifier
    syllabifier = Syllabifier()
    corpus = lmloCorpus()

    # ------------------------------------------------------------------
    # populate chant data frame
    # ------------------------------------------------------------------
    # Map the bracketed LMLO office prefix to a short subcorpus code.
    translate_subcorpus = dict()
    translate_subcorpus['Feast'] = 'LF'
    translate_subcorpus['Saint'] = 'LS'
    translate_subcorpus['Humbert'] = 'H'
    translate_subcorpus['Humbert Sanct.'] = 'HS'
    translate_subcorpus['Humbert Temp.'] = 'HT'
    _data = defaultdict(list)
    for i, c in enumerate(corpus.chants):
        _data['chantID'].append(i)
        _data['corpus'].append('L')
        # office string presumably looks like '[Subcorpus] ...' — take the
        # text inside the leading brackets (TODO confirm against lmlo module)
        _subcorpus = c.office.split(']')[0][1:]
        _data['subcorpus'].append(translate_subcorpus[_subcorpus])
        _data['Modus'].append(c.mode)
        _data['modus'].append(c.mode.lower())
        # maneria: paired modes (1-2, 3-4, 5-6, 7-8) share a final
        if c.mode[0] in ['1', '2']:
            _data['maneria'].append('protus')
        elif c.mode[0] in ['3', '4']:
            _data['maneria'].append('deuterus')
        elif c.mode[0] in ['5', '6']:
            _data['maneria'].append('tritus')
        elif c.mode[0] in ['7', '8']:
            _data['maneria'].append('tetrardus')
        else:
            _data['maneria'].append('unknown')
        # ambitus: an upper-case second mode character flags excessive range.
        # NOTE(review): digits and symbols equal their own .upper(), so a
        # numeric second character would also test 'excessive' — presumably
        # mode is always letter-suffixed here; confirm.
        if c.mode[1] == c.mode[1].upper():
            _data['ambitus'].append('excessive')
        elif c.mode[0] in ['1', '3', '5', '7']:
            _data['ambitus'].append('authentic')
        elif c.mode[0] in ['2', '4', '6', '8']:
            _data['ambitus'].append('plagal')
        else:
            _data['ambitus'].append('unknown')
        # office name with the bracketed subcorpus tag stripped
        _data['office'].append(' '.join(c.office.split()[1:]))
        # switching the names Service/service and Genre/genre from lmlo module
        # for consistency with Modus/modus: capital is the more granular grouping
        _data['Service'].append(c.service)
        _data['service'].append(c.Service)
        _data['ordinal'].append(c.index)
        _data['Genre'].append(c.genre)
        _data['genre'].append(c.Genre)
        _data['text'].append(c.fulltext)
        _data['volpiano'].append(c.volpiano)
    chants = pd.DataFrame(_data)
    chants.to_pickle('chantData.zip', protocol=4)

    # ------------------------------------------------------------------
    # populate note data frame
    # ------------------------------------------------------------------
    # first some utils we'll use in the loop below
    def pindex(sd):
        # scale-degree pair -> linear pitch index (7 degrees per register)
        return (int(sd[0]) * 7 + int(sd[1]))

    def intclass(interval):
        # classify an interval by absolute size
        interval = abs(interval)
        if interval == 0:
            return 'rep'
        elif interval == 1:
            return 'step'
        elif interval == 2:
            return 'slip'
        else:
            return 'leap'

    _data = defaultdict(list)
    for i_c, c in enumerate(corpus.chants):
        # i is a running cursor into c.flatSD across the whole chant.
        # NOTE(review): starts at 1 and right-edge test below uses len-2 —
        # flatSD presumably carries leading/trailing sentinel entries; confirm.
        i = 1
        for i_w, w in enumerate(c.words):
            for i_s, s in enumerate(w.syllables):
                for i_n, n in enumerate(s.notes):
                    # identify note's location in the corpus
                    _data['chantID'].append(i_c)
                    _data['word'].append(i_w)
                    _data['syll'].append(i_s)
                    _data['note'].append(i_n)
                    # identify initial and final syllable (1) and word (2) boundaries
                    # (0 = interior; 1 = syllable edge only; 2 = word edge)
                    initial = 0
                    if i_n == 0:
                        initial += 1
                    if i_s == 0:
                        initial *= 2
                    final = 0
                    if i_n == len(s.notes) - 1:
                        final += 1
                    if i_s == len(w.syllables) - 1:
                        final *= 2
                    _data['boundary_before'].append(initial)
                    _data['boundary_after'].append(final)
                    # extract pitch and register features:
                    # absolute (letter) and relative/scale-degree (sd) forms
                    _data['reg_abs'].append(n.letter[0])
                    _data['pc_abs'].append(n.letter[1])
                    _data['pitch_abs'].append(
                        n.letter[0] + '.' + n.letter[1])
                    _data['reg_rel'].append(n.sd[0])
                    _data['pc_rel'].append(n.sd[1])
                    _data['pitch_rel'].append(
                        n.sd[0] + '.' + n.sd[1])
                    # calculate intervallic context: interval to the left...
                    if i == 1:
                        _data['lint'].append(99)  # 99 = edge sentinel
                        _data['lint_class'].append('edge')
                        _data['lint_dir'].append('edge')
                    else:
                        interval = int(pindex(c.flatSD[i])
                                       - pindex(c.flatSD[i - 1]))
                        _data['lint'].append(interval)
                        _data['lint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['lint_dir'].append('up')
                        elif interval < 0:
                            _data['lint_dir'].append('down')
                        else:
                            _data['lint_dir'].append('rep')
                    # ...and to the right
                    if i == len(c.flatSD) - 2:
                        _data['rint_class'].append('edge')
                        _data['rint_dir'].append('edge')
                        _data['rint'].append(99)
                    else:
                        interval = int(pindex(c.flatSD[i + 1])
                                       - pindex(c.flatSD[i]))
                        _data['rint'].append(interval)
                        _data['rint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['rint_dir'].append('up')
                        elif interval < 0:
                            _data['rint_dir'].append('down')
                        else:
                            _data['rint_dir'].append('rep')
                    i += 1
    # add interval info
    notes = pd.DataFrame(_data)
    # per-chant modus keyed by chantID, taken from each chant's first note row
    modekey = chants.merge(notes).query("word == 0 and syll == 0 and note == 0").set_index('chantID').modus.to_frame()
    notes = notes.join(modekey.modus, on='chantID', how='inner')
    notes.to_pickle('noteData.zip', protocol=4)

    # ------------------------------------------------------------------
    # populate syllable data frame
    # ------------------------------------------------------------------
    syllables = defaultdict(list)
    # manual syllabification overrides for words CLTK missegments
    override = dict()
    override['eius'] = ['e', 'ius']
    override['dei'] = ['de', 'i']
    override['deus'] = ['de', 'us']
    override['quia'] = ['qui', 'a']
    override['christi'] = ['chris', 'ti']
    override['christe'] = ['chris', 'te']
    override['eum'] = ['e', 'um']
    override['deum'] = ['de', 'um']
    override['meum'] = ['me', 'um']
    override['meus'] = ['me', 'us']
    override['christo'] = ['chris', 'to']
    override['christus'] = ['chris', 'tus']
    override['christum'] = ['chris', 'tum']
    override['mei'] = ['me', 'i']
    override['ei'] = ['e', 'i']
    override['cui'] = ['cu', 'i']
    override['israel'] = ['is', 'ra', 'el']
    override['sanguine'] = ['san', 'gui', 'ne']
    override['meis'] = ['me', 'is']
    override['eis'] = ['e', 'is']
    override['fidei'] = ['fi', 'de', 'i']
    override['sanguinem'] = ['san', 'gui', 'nem']
    override['lingua'] = ['lin', 'gua']
    override['thronum'] = ['thro', 'num']
    override['pulchra'] = ['pul', 'chra']
    override['oleum'] = ['o', 'le', 'um']
    override['adiutor'] = ['ad', 'iu', 'tor']
    override['sanguis'] = ['san', 'guis']
    override['sanguinis'] = ['san', 'gui', 'nis']
    override['huic'] = ['hu', 'ic']
    override['alleluia'] = ['al', 'le', 'lu', 'ia']
    override['michael'] = ['mi', 'cha', 'el']
    override['noe'] = ['no', 'e']
    for i, c in chants.iterrows():
        # only chants in the basic modes (plus '6c') are aligned
        if c.modus not in basicModes + ['6c']:
            continue
        words = c.text.lower().split()
        # NOTE(review): volpiano[4:-3] presumably strips the clef prefix and
        # final barline characters; '--' separates words — confirm encoding.
        vwords = c.volpiano[4:-3].split('--')
        if len(words) != len(vwords):
            # heuristic repair: merge the last two volpiano "words" when the
            # text/melody word counts disagree
            vwords[-2] = vwords[-2] + '-' + vwords[-1]
            vwords.pop(-1)
        for j in range(len(words)):
            if words[j] in override:
                sylls = override[words[j]]
            else:
                sylls = syllabifier.syllabify(words[j].lower())
            vsylls = vwords[j].split('-')
            if len(sylls) != len(vsylls):
                # unalignable word: mark every melodic syllable with the
                # bracketed full word instead of a text syllable
                sylls = [f'[{words[j]}]'] * len(vsylls)
            for k in range(len(vsylls)):
                syllables['chantID'].append(c.chantID)
                syllables['syllable'].append(sylls[k])
                syllables['last_syll'].append(k + 1 == len(vsylls))
                v = vsylls[k]
                syllables['n_notes'].append(len(v))
                syllables['volpiano'].append(v)
                # spell out the pitches of this syllable's volpiano characters
                notes = ''
                for vchar in v:
                    notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
                syllables['notes'].append(notes)
                syllables['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])
                syllables['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
                syllables['t_type'].append(v2r(v))
                syllables['e_type'].append(v2r(v[0] + v[-1]))
                syllables['c_type'].append(v2c(v))
    syllables = pd.DataFrame(syllables)
    syllables = syllables.join(modekey.modus, on='chantID', how='inner')
    syllables['extrema'] = syllables['pitch_initial'] + '-' + syllables['pitch_final']
    syllables.to_pickle('syllableData.zip', protocol=4)

    # ------------------------------------------------------------------
    # populate n-gram data frame
    # ------------------------------------------------------------------
    ngrams = defaultdict(list)
    for i, c in chants.iterrows():
        v = c.volpiano.replace('-', '')
        # V = v with duplicate pitches removed (consecutive repeats collapsed)
        V = v[0]
        for k in range(1, len(v)):
            if v[k] != V[-1]:
                V += v[k]
        for n in range(1, n_limit + 1):
            # NOTE(review): range starts at 1 (presumably skipping the clef
            # character at V[0]) and stops at len(V)-n, which omits the final
            # n-gram position — confirm both bounds are intentional.
            for k in range(1, len(V) - n):
                # NOTE(review): rebinds v (the full melody string) to the
                # current n-gram; the outer v is not used again afterwards.
                v = V[k:k + n]
                ngrams['chantID'].append(c.chantID)
                ngrams['pos'].append(k)
                ngrams['n_notes'].append(len(v))
                ngrams['volpiano'].append(v)
                notes = ''
                for vchar in v:
                    notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
                ngrams['notes'].append(notes)
                ngrams['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])
                ngrams['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
                ngrams['t_type'].append(v2r(v))
                ngrams['e_type'].append(v2r(v[0] + v[-1]))
                ngrams['c_type'].append(v2c(v))
    ngrams = pd.DataFrame(ngrams)
    ngrams = ngrams.join(modekey.modus, on='chantID', how='inner')
    ngrams['extrema'] = ngrams['pitch_initial'] + '-' + ngrams['pitch_final']
    print('making pickles')
    ngrams.to_pickle('ngramData.zip', protocol=4)