コード例 #1
0
ファイル: test_stem.py プロジェクト: manu-chroma/cltk
 def test_latin_syllabifier(self):
     """The Latin syllabifier should split 'sidere' into three syllables."""
     result = Syllabifier().syllabify('sidere')
     self.assertEqual(result, ['si', 'de', 're'])
コード例 #2
0
ファイル: test_stem.py プロジェクト: stenskjaer/cltk
 def test_latin_syllabifier(self):
     """Check that the Latin syllabifier segments 'sidere' correctly."""
     expected = ['si', 'de', 're']
     actual = Syllabifier().syllabify('sidere')
     self.assertEqual(actual, expected)
コード例 #3
0
ファイル: test_stem.py プロジェクト: vierth/cltk
 def test_latin_syllabifier(self):
     """Test the Latin syllabifier on a plain word and on macronized words."""
     syllabifier = Syllabifier()
     cases = [
         ('sidere', ['si', 'de', 're']),
         # macronized words
         ('audītū', ['au', 'dī', 'tū']),
         ('conjiciō', ['con', 'ji', 'ci', 'ō']),
         ('ā', ['ā']),
     ]
     for word, expected in cases:
         self.assertEqual(syllabifier.syllabify(word), expected)
コード例 #4
0
ファイル: test_stem.py プロジェクト: TylerKirby/cltk
 def test_latin_syllabifier(self):
     """Test the Latin syllabifier, including macronized input."""
     syllabifier = Syllabifier()
     # plain (unmacronized) word
     self.assertEqual(syllabifier.syllabify('sidere'), ['si', 'de', 're'])
     # macronized words
     self.assertEqual(syllabifier.syllabify('audītū'), ['au', 'dī', 'tū'])
     self.assertEqual(syllabifier.syllabify('conjiciō'),
                      ['con', 'ji', 'ci', 'ō'])
     self.assertEqual(syllabifier.syllabify('ā'), ['ā'])
コード例 #5
0
    def third_declension(self):
        """Build the paradigm of a third-declension noun.

        Returns a list ``forms`` of twelve entries: nominative and
        genitive singular (taken from ``self.nom`` / ``self.gen``),
        followed by the remaining cases built from the oblique stem.
        I-stems and neuter nouns are handled as special cases.

        Assumes ``self.nom``, ``self.gen`` and ``self.gender`` are set
        and that ``DeclineNoun.id_declension`` returns the stem at
        index 1 — TODO confirm against the rest of the class.
        """
        from cltk.stem.latin.syllabifier import Syllabifier

        syllabifier = Syllabifier()

        vowels = ['a', 'e', 'i', 'o', 'u', 'ā', 'ē', 'ī', 'ō', 'ū']

        # Oblique-case stem shared by all endings below.
        base = DeclineNoun.id_declension(self)[1]

        forms = [self.nom, self.gen]

        # 'blah' is a deliberate placeholder: that slot (forms[5]) is
        # overwritten with the nominative immediately after the loop.
        endings = [
            'ī', 'em', 'e', 'blah', 'ēs', 'um', 'ibus', 'ēs', 'ibus', 'ēs'
        ]

        for ending in endings:
            forms.append(base + ending)

        forms[5] = self.nom

        # I-stem detection: parisyllabic nouns ending in -is/-es, or
        # nouns in -x/-s whose stem ends in two consonants.
        nom_syllable = len(syllabifier.syllabify(self.nom))
        gen_syllable = len(syllabifier.syllabify(self.gen))

        i_stem = False

        if nom_syllable == gen_syllable:
            if self.nom[-2:] in ['is', 'es']:
                i_stem = True
        elif self.nom[-1] in ['x', 's']:
            if base[-1] not in vowels and base[-2] not in vowels:
                i_stem = True

        if i_stem:
            # Genitive plural takes -ium instead of -um: splice an 'i'
            # before the final two characters.
            forms[7] = forms[7][:-2] + 'i' + forms[7][-2:]

        if self.gender == 'neutrum':
            # Neuters: accusative singular equals the nominative, and the
            # plural nominative/accusative/vocative all end in -a.
            forms[5] = self.nom
            forms[3] = self.nom
            forms[6] = base + 'a'
            forms[9] = base + 'a'
            forms[11] = base + 'a'
            # Pure i-stem neuters (in -e, -al, -ar): ablative singular -ī,
            # plurals in -ia, genitive plural -ium.
            if self.nom[-1] == 'e' or self.nom[-2:] in ['al', 'ar']:
                forms[4] = base + 'ī'
                forms[6] = base + 'ia'
                forms[7] = base + 'ium'
                forms[9] = base + 'ia'
                forms[11] = base + 'ia'
        return forms
コード例 #6
0
class ChantSyllabifier(metaclass=Singleton):
    """Latin syllabifier tuned for chant texts.

    Wraps the CLTK ``Syllabifier`` with a modified Latin ruleset plus a
    hand-curated exception dictionary.  Built as a singleton (via the
    ``Singleton`` metaclass) so the ruleset is constructed only once.
    """

    def __init__(self):
        # Work on a copy so the module-level LATIN ruleset is not mutated.
        latin = deepcopy(LATIN)
        exceptions = self.get_exceptions()
        latin['exceptions'] = exceptions
        latin['diphthongs'] = ["ae", "au", "oe"]  # Not: eu, ei
        # Treat 'h' like the mute consonants so it stays with the
        # following vowel when clusters are split.
        latin['mute_consonants_and_f'].append('h')
        self.syllabifier = Syllabifier(latin)

    def get_exceptions(self):
        """Return the merged exception dictionary (word -> syllable list).

        Combines the stock CLTK Latin exceptions with manual corrections
        for the most frequent under- and over-segmentations; the trailing
        percentages record each word's observed error frequency.
        """
        # See notebook "Identify syllabification errors" for background:
        # We checked the most frequent under/oversegmentations, and
        # manually corrected those

        undersegmented = {
            'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],  # 29.43%
            'quia': ['qui', 'a'],  # 13.91%
            'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],  # 6.65%
            'israel': ['is', 'ra', 'el'],  # 2.64%
            'cui': ['cu', 'i'],  # 1.74%
            'michael': ['mic', 'ha', 'el'],  # 0.84%
            #'qui': ['qui'],                                   # 0.50%
            'requiem': ['re', 'qui', 'em'],  # 0.41%
            'huic': ['hu', 'ic'],  # 0.41%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],           # 0.38%
            # 'alleluia': ['al', 'le', 'lu', 'ia'],             # 0.27%
            #'noe': ['noe'],                                   # 0.22%
            'requiescet': ['re', 'qui', 'es', 'cet'],  # 0.21%
            'exiit': ['ex', 'i', 'it'],  # 0.17%
            'exierunt': ['ex', 'i', 'e', 'runt'],  # 0.13%
            'eloquium': ['e', 'lo', 'qui', 'um'],  # 0.12%
            'exiet': ['ex', 'i', 'et'],  # 0.12%
            # 'gelboe': ['gel', 'boe'],                         # 0.11%
            'ierit': ['i', 'e', 'rit'],  # 0.10%
            'christi': ['chris', 'ti'],  # 0.10%
            'saul': ['sa', 'ul'],  # 0.09%
            'colloquiis': ['col', 'lo', 'qui', 'is'],  # 0.09%
            'israelita': ['is', 'ra', 'e', 'li', 'ta'],  # 0.09%
            'michaele': ['mic', 'ha', 'e', 'le'],  # 0.08%
            'requiescit': ['re', 'qui', 'es', 'cit'],  # 0.08%
            'obsequia': ['ob', 'se', 'qui', 'a'],  # 0.07%
            # 'jesus': ['je', 'sus'],                           # 0.07%
            'nicolaum': ['ni', 'co', 'laum'],  # 0.06%
            'requies': ['re', 'qui', 'es'],  # 0.06%
            # NOTE(review): 'c**t' below looks like a masked 'cunt'
            # introduced by the data source — the intended syllable is
            # presumably 'cunt' (requies-cunt); verify before relying on it.
            'requiescunt': ['re', 'qui', 'es', 'c**t'],  # 0.06%
            'exierit': ['ex', 'i', 'e', 'rit'],  # 0.06%
            'michaelis': ['mic', 'ha', 'e', 'lis'],  # 0.05%
            'requiescent': ['re', 'qui', 'es', 'cent'],  # 0.05%
        }

        # Recurring issues are "guen" and "quu"
        oversegmented = {
            'sanguine': ['san', 'gui', 'ne'],  # 1.45%
            'sanguinem': ['san', 'gui', 'nem'],  # 1.43%
            'lingua': ['lin', 'gua'],  # 1.11%
            'alleluya': ['al', 'le', 'lu', 'ya'],  # 0.88%
            'sanguis': ['san', 'guis'],  # 0.83%
            'est*': ['est*'],  # 0.64%
            #'eleison': ['e', 'le', 'i', 'son'],               # 0.59%
            'linguis': ['lin', 'guis'],  # 0.59%
            'linguae': ['lin', 'guae'],  # 0.47%
            'sequuntur': ['se', 'quun', 'tur'],  # 0.42%
            'sanguinis': ['san', 'gui', 'nis'],  # 0.40%
            #'euge': ['e', 'u', 'ge'],                         # 0.29%
            'eleemosynam': ['e', 'lee', 'mo', 'sy', 'nam'],  # 0.27%
            'iniquum': ['in', 'i', 'quum'],  # 0.23%
            'sunt*': ['sunt*'],  # 0.23%
            'unguenti': ['un', 'guen', 'ti'],  # 0.21%
            'persequuntur': ['per', 'se', 'quun', 'tur'],  # 0.20%
            'unguentum': ['un', 'guen', 'tum'],  # 0.20%
            'unguentorum': ['un', 'guen', 'to', 'rum'],  # 0.16%
            'urbs': ['urbs'],  # 0.16%
            'equuleo': ['e', 'quu', 'le', 'o'],  # 0.15%
            #'perpetuum': ['per', 'pe', 'tu', 'um'],           # 0.14%
            #'antiquus': ['an', 'ti', 'qu', 'us'],             # 0.14%
            'sanguinibus': ['san', 'gui', 'ni', 'bus'],  # 0.13%
            'eleemosyna': ['e', 'lee', 'mo', 'sy', 'na'],  # 0.13%
            'linguam': ['lin', 'guam'],  # 0.13%
            'stirps': ['stirps'],  # 0.11%
            #'ait': ['a', 'it'],                               # 0.11%
            'languores': ['lan', 'guo', 'res'],  # 0.11%
            #'jerusalem': ['je', 'ru', 'sa', 'lem'],           # 0.10%
            'loquuntur': ['lo', 'quun', 'tur'],  # 0.09%
            # 'tuum': ['tu', 'um'],                             # 0.09%
            # 'ideoque': ['i', 'de', 'o', 'que'],               # 0.09%
            'annuntiaverunt*': ['an', 'nun', 'ti', 'a', 've',
                                'runt*'],  # 0.09%
            'linguarum': ['lin', 'gua', 'rum'],  # 0.09%
            'in*': ['in*'],  # 0.09%
            'unguento': ['un', 'guen', 'to'],  # 0.09%
            'urguentes': ['ur', 'guen', 'tes'],  # 0.09%
            'langueo': ['lan', 'gue', 'o'],  # 0.08%
            'sanguinum': ['san', 'gui', 'num'],  # 0.08%
            'ihesum': ['ihe', 'sum'],  # 0.08%
            'languoribus': ['lan', 'guo', 'ri', 'bus'],  # 0.07%
            'probaverunt': ['pro', 'ba', 've', 'runt'],  # 0.07%
            'faciam': ['fa', 'ci', 'am'],  # 0.07%
            #'equum': ['e', 'qu', 'um'],                       # 0.07%
            #'jerusalem*': ['je', 'ru', 'sa', 'lem*'],         # 0.07%
            'moyses': ['moy', 'ses'],  # 0.07%
            'pinguedine': ['pin', 'gue', 'di', 'ne'],  # 0.07%
            'linguas': ['lin', 'guas'],  # 0.06%
            #'erue': ['e', 'ru', 'e'],                         # 0.06%
            'galaaditim': ['ga', 'laa', 'di', 'tim'],  # 0.06%
            'languentium': ['lan', 'guen', 'ti', 'um'],  # 0.05%
            'mansuetudinem': ['man', 'sue', 'tu', 'di', 'nem'],  # 0.05%
            #'iniquus': ['in', 'i', 'quus'],               # 0.05%
            #'filiis': ['fi', 'li', 'is'],                     # 0.05%
            'gloria*': ['glo', 'ri', 'a*'],  # 0.05%
            'leyson': ['ley', 'son'],  # 0.05%
            'moysi': ['moy', 'si'],  # 0.05%
            #'suavitatis': ['su', 'a', 'vi', 'ta', 'tis'],     # 0.05%
            'accipite': ['ac', 'ci', 'pi', 'te'],  # 0.05%
            'exsurgens*': ['ex', 'sur', 'gens*'],  # 0.05%
        }

        js_cantus_exceptions = {
            # Exceptions from the alignment algorithm used on the
            # Cantus website
            #'euouae': ['e', 'u', 'o', 'u', 'a', 'e'],
            #'seuouae': ['se', 'u', 'o', 'u', 'a', 'e'],
            #'alleluya': ['al', 'le', 'lu', 'ya'],
            'hierusalem': ['hie', 'ru', 'sa', 'lem'],
            'hiesum': ['hie', 'sum'],
            'kyrieleison': ['ky', 'ri', 'e', 'lei', 'son'],
            'xpisteleison': ['xpi', 'ste', 'lei', 'son'],
            'eleison': ['e', 'lei', 'son'],
        }

        # Later dicts win on key collisions: manual corrections override
        # the stock CLTK exceptions.
        exceptions = dict(LATIN['exceptions'], **undersegmented,
                          **oversegmented, **js_cantus_exceptions)
        return exceptions

    def syllabify(self, text):
        """
        Syllabify the (lowercased) text.

        Input is lowercased first because otherwise the CLTK
        syllabifier doesn't work well.
        """
        return self.syllabifier.syllabify(text.lower())
コード例 #7
0
ファイル: chant.py プロジェクト: eyequeue/chant
def _recalculate():

    from cltk.stem.latin.syllabifier import Syllabifier
    syllabifier = Syllabifier()

    corpus = lmloCorpus()

    # populate chant data frame

    translate_subcorpus = dict()
    translate_subcorpus['Feast'] = 'LF'
    translate_subcorpus['Saint'] = 'LS'
    translate_subcorpus['Humbert'] = 'H'
    translate_subcorpus['Humbert Sanct.'] = 'HS'
    translate_subcorpus['Humbert Temp.'] = 'HT'

    _data = defaultdict(list)
    for i, c in enumerate(corpus.chants):
        _data['chantID'].append(i)
        _data['corpus'].append('L')
        _subcorpus = c.office.split(']')[0][1:]
        # if _subcorpus == 'Saint':
        #     _subcorpus = 'Sanctorale'
        # if _subcorpus == 'Humber':
        #     _subcorpus = 'Humbert'
        # if _subcorpus == 'Feast':
        #     _subcorpus = 'Feast'
        _data['subcorpus'].append(translate_subcorpus[_subcorpus])
        _data['Modus'].append(c.mode)
        _data['modus'].append(c.mode.lower())
        if c.mode[0] in ['1','2']:
            _data['maneria'].append('protus')
        elif c.mode[0] in ['3','4']:
            _data['maneria'].append('deuterus')
        elif c.mode[0] in ['5','6']:
            _data['maneria'].append('tritus')
        elif c.mode[0] in ['7','8']:
            _data['maneria'].append('tetrardus')
        else:
            _data['maneria'].append('unknown')
        if c.mode[1] == c.mode[1].upper():
            _data['ambitus'].append('excessive')
        elif c.mode[0] in ['1','3','5','7']:
            _data['ambitus'].append('authentic')
        elif c.mode[0] in ['2','4','6','8']:
            _data['ambitus'].append('plagal')
        else:
            _data['ambitus'].append('unknown')


        _data['office'].append(' '.join(c.office.split()[1:]))

        # switching the names Service/service and Genre/genre from lmlo module
        # for consistency with Modus/modus: capital is the more granular grouping

        _data['Service'].append(c.service)
        _data['service'].append(c.Service)
        _data['ordinal'].append(c.index)
        _data['Genre'].append(c.genre)
        _data['genre'].append(c.Genre)
        _data['text'].append(c.fulltext)
        # _data['lmloHeader'].append(c.header)
        # _data['lmloEncoding'].append(c.lmloEncoding)
        _data['volpiano'].append(c.volpiano)

    chants = pd.DataFrame(_data)
    chants.to_pickle('chantData.zip', protocol=4)

    # populate note data frame

    # first some utils we'll use in the loop below

    def pindex(sd):
        return (int(sd[0])*7 + int(sd[1]))

    def intclass(interval):
        interval = abs(interval)
        if interval == 0:
            return 'rep'
        elif interval == 1:
            return 'step'
        elif interval == 2:
            return 'slip'
        else:
            return 'leap'


    _data = defaultdict(list)
    for i_c, c in enumerate(corpus.chants):
        i = 1
        for i_w, w in enumerate(c.words):
            for i_s, s in enumerate(w.syllables):
                for i_n, n in enumerate(s.notes):

                    # identify note's location in the corpus

                    _data['chantID'].append(i_c)
                    _data['word'].append(i_w)
                    _data['syll'].append(i_s)
                    _data['note'].append(i_n)


                    # identify initial and final syllable (1) and word (2) boundaries

                    initial = 0
                    if i_n == 0:
                        initial += 1
                    if i_s == 0:
                        initial *= 2
                    final = 0
                    if i_n == len(s.notes) - 1:
                        final += 1
                    if i_s == len(w.syllables) - 1:
                        final *= 2
                    _data['boundary_before'].append(initial)
                    _data['boundary_after'].append(final)

                    # extract pitch and register features

                    _data['reg_abs'].append(n.letter[0])
                    _data['pc_abs'].append(n.letter[1])
                    _data['pitch_abs'].append( n.letter[0] + '.' + n.letter[1])
                    _data['reg_rel'].append(n.sd[0])
                    _data['pc_rel'].append(n.sd[1])
                    _data['pitch_rel'].append( n.sd[0] + '.' + n.sd[1])
                    

                    # calculate intervallic context


                    if i == 1:
                        _data['lint'].append(99)
                        _data['lint_class'].append('edge')
                        _data['lint_dir'].append('edge')
                    else:
                        interval = int(pindex(c.flatSD[i]) - pindex(c.flatSD[i-1]))
                        _data['lint'].append(interval)
                        _data['lint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['lint_dir'].append('up')
                        elif interval < 0:
                            _data['lint_dir'].append('down')
                        else:
                            _data['lint_dir'].append('rep')

                    if i == len(c.flatSD)-2:
                        _data['rint_class'].append('edge')
                        _data['rint_dir'].append('edge')
                        _data['rint'].append(99)
                    else:
                        interval = int(pindex(c.flatSD[i+1]) - pindex(c.flatSD[i]))
                        _data['rint'].append(interval)
                        _data['rint_class'].append(intclass(interval))
                        if interval > 0:
                            _data['rint_dir'].append('up')
                        elif interval < 0:
                            _data['rint_dir'].append('down')
                        else:
                            _data['rint_dir'].append('rep')
                        

                    i += 1

    # add interval info



    notes = pd.DataFrame(_data)
    modekey = chants.merge(notes).query("word == 0 and syll == 0 and note == 0").set_index('chantID').modus.to_frame()
    notes = notes.join(modekey.modus, on='chantID', how='inner')
    notes.to_pickle('noteData.zip', protocol=4)
    
    syllables = defaultdict(list)
    override = dict()
    override['eius'] = ['e','ius']
    override['dei'] = ['de','i']
    override['deus'] = ['de','us']
    override['quia'] = ['qui','a']
    override['christi'] = ['chris','ti']
    override['christe'] = ['chris','te']
    override['eum'] = ['e','um']
    override['deum'] = ['de','um']
    override['meum'] = ['me','um']
    override['meus'] = ['me','us']
    override['christo'] = ['chris','to']
    override['christus'] = ['chris','tus']
    override['christum'] = ['chris','tum']
    override['mei'] = ['me','i']
    override['ei'] = ['e','i']
    override['cui'] = ['cu','i']
    override['israel'] = ['is','ra','el']
    override['sanguine'] = ['san','gui','ne']
    override['meis'] = ['me','is']
    override['eis'] = ['e','is']
    override['fidei'] = ['fi','de','i']
    override['sanguinem'] = ['san','gui','nem']
    override['lingua'] = ['lin','gua']
    override['thronum'] = ['thro','num']
    override['pulchra'] = ['pul','chra']
    override['oleum'] = ['o','le','um']
    override['adiutor'] = ['ad','iu','tor']
    override['sanguis'] = ['san','guis']
    override['sanguinis'] = ['san','gui','nis']
    override['huic'] = ['hu','ic']
    override['alleluia'] = ['al','le','lu','ia']
    override['michael'] = ['mi','cha','el']
    override['noe'] = ['no','e']
    
    
    for i, c in chants.iterrows():
      if c.modus not in basicModes+['6c']:
          continue
      # if i>200: break
      words = c.text.lower().split()
      vwords = c.volpiano[4:-3].split('--')
      if len(words) != len(vwords):
        # print(f'oops: {len(words)} {len(vwords)}')
        # print(words)
        # print(vwords)    
        vwords[-2] = vwords[-2] + '-' + vwords[-1]
        vwords.pop(-1)
        # print('--fixing--')
        # print(words)
        # print(vwords)
      for j in range(len(words)):
        if words[j] in override:
          sylls = override[words[j]]
        else:
          sylls = syllabifier.syllabify(words[j].lower())
        vsylls = vwords[j].split('-')
        if len(sylls) != len(vsylls):
          sylls = [f'[{words[j]}]'] * len(vsylls)
        for k in range(len(vsylls)):
          syllables['chantID'].append(c.chantID)
          syllables['syllable'].append(sylls[k])
          syllables['last_syll'].append(k+1 == len(vsylls))
          v = vsylls[k]
          syllables['n_notes'].append(len(v))
          syllables['volpiano'].append(v)
          notes = ''
          for vchar in v:
            notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
          syllables['notes'].append(notes)
          syllables['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])
          syllables['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
          syllables['t_type'].append(v2r(v))
          syllables['e_type'].append(v2r(v[0]+v[-1]))
          syllables['c_type'].append(v2c(v))
   

    syllables = pd.DataFrame(syllables)
    syllables = syllables.join(modekey.modus, on='chantID', how='inner')
    syllables['extrema'] = syllables['pitch_initial'] + '-' + syllables['pitch_final']
    syllables.to_pickle('syllableData.zip', protocol=4)


    ngrams = defaultdict(list)
    
    
    for i, c in chants.iterrows():
        # if i>0: break
        v = c.volpiano.replace('-','')
    
        # V = v with duplicate pitches removed
    
        V = v[0]
        for k in range(1, len(v)):
            if v[k] != V[-1]:
                V += v[k]
    
        
        for n in range(1, n_limit+1):
            for k in range(1,len(V)-n):
                v = V[k:k+n]
                ngrams['chantID'].append(c.chantID)
                ngrams['pos'].append(k)
                ngrams['n_notes'].append(len(v))
                ngrams['volpiano'].append(v)
                notes = ''
                for vchar in v:
                    notes += f'{gamut_pitches[gamut_volpiano.index(vchar)]} '
                ngrams['notes'].append(notes)
                ngrams['pitch_initial'].append(gamut_pitches[gamut_volpiano.index(v[0])])            
                ngrams['pitch_final'].append(gamut_pitches[gamut_volpiano.index(v[-1])])
                ngrams['t_type'].append(v2r(v))
                ngrams['e_type'].append(v2r(v[0]+v[-1]))
                ngrams['c_type'].append(v2c(v))
    
    ngrams = pd.DataFrame(ngrams)
    ngrams = ngrams.join(modekey.modus, on='chantID', how='inner')
    
    ngrams['extrema'] = ngrams['pitch_initial'] + '-' + ngrams['pitch_final']
    
    print('making pickles')
    ngrams.to_pickle('ngramData.zip', protocol=4)