Example #1
0
def syllabify_cmu(cmu_token):
	"""Syllabify a CMU-dictionary pronunciation string.

	Uses the third-party `syllabify` package with its English settings.
	"""
	import syllabify as sy
	# syllabify wants stress digits attached to the vowel symbol, so drop
	# the space before primary/secondary stress marks
	normalized = cmu_token.replace(' 2', '2').replace(' 1', '1')
	return sy.syllabify(sy.English, normalized)
Example #2
0
def wcm(phonemes, *sylab):
    """Stoel-Gammon's Word Complexity Measure for `phonemes`.

    The input is run through `translator` and then `syllabify`; each
    syllable is indexed as (onset, nucleus, coda). Extra positional
    arguments (*sylab) are accepted but unused, matching the original
    interface. Returns the integer WCM score.
    """
    syls = syllabify(translator(phonemes))
    score = 0
    if len(syls) > 2:                           # more than two syllables
        score += 1
    if len(syls) > 1 and not syls[0][1][-1].endswith('1'):
        score += 1                              # stress off the first syllable
    if syls[-1][2] != []:                       # word-final consonant
        score += 1
    # Per-syllable tallies folded into a single pass (sums commute).
    for syl in syls:
        edges = syl[0] + syl[2]
        score += (len(syl[0]) > 1) + (len(syl[2]) > 1)  # one point per cluster
        score += sum(p in DORSALS for p in edges)       # velars
        score += sum(p in LIQUIDS for p in edges)       # liquids
        score += sum(len(p) > 1 and p[1] == 'R' for p in syl[1])  # rhotic vowels
        score += sum(p in AF for p in edges)            # fricatives/affricates
        score += sum(p in VOICED_AF for p in edges)     # voiced ones score again
    return score
Example #3
0
def syllabify_cmu(cmu_token):
	"""Syllabify a CMU-dictionary pronunciation string via the `syllabify` package."""
	import syllabify as sy
	cmu_token=cmu_token.replace(' 2','2').replace(' 1','1') # fix prim/sec stress markings for syllabify: attach the stress digit to the vowel symbol
	sylls = sy.syllabify(sy.English, cmu_token)
	#for x in sylls:
	#	print x
	return sylls
def persistent(w, lemma):
    """Accent *w* persistently, anchored on *lemma*'s accent position.

    The lemma's accent place is shifted by the difference in syllable
    count; if that accentuation is impossible, fall back first to a
    circumflex in the same place (for an acute), then retract the acute
    up to three syllables.
    """
    w = w.replace("|", "")
    place, accent = get_accent_type(lemma)
    syllables = syllabify(w)
    candidates = list(possible_accentuations(syllables))
    shifted = len(syllables) - len(syllabify(lemma)) + place
    chosen = (shifted, accent)
    if chosen not in candidates:
        if accent == ACUTE and (shifted, CIRCUMFLEX) in candidates:
            chosen = (shifted, CIRCUMFLEX)
        else:
            for step in (1, 2, 3):
                retracted = (shifted - step, ACUTE)
                if retracted in candidates:
                    chosen = retracted
                    break
    return add_accent(syllables, chosen)
Example #5
0
def wcm(phonemes, *sylab):
    """
    The "Word Complexity Measure", as proposed in:

    C. Stoel-Gammon. 2010. The Word Complexity Measure: Description and
    application to developmental phonology and disorders. Clinical
    Linguistics and Phonetics 24(4-5): 271-282.

    `phonemes` is handed to `syllabify`; each resulting syllable is
    indexed here as syl[0]=onset, syl[1]=nucleus, syl[2]=coda. The extra
    *sylab arguments are accepted but never used. Returns the integer
    WCM score.
    """
    syls = syllabify(phonemes)
    # begin scoring
    score = 0
    ## Word patterns
    # (1) Productions with more than two syllables receive 1 point
    if len(syls) > 2:
        score += 1
    # FIXME <stupid_rule>
    # (2) Productions with stress on any syllable but the first receive
    # 1 point [this rule is stupid --KG]
    if len(syls) > 1 and not syls[0][1][-1].endswith('1'):
        score += 1
    # FIXME </stupid_rule>
    ## Syllable structures
    # (1) Productions with a word-final consonant receive 1 point
    if syls[-1][2] != []:
        score += 1
    # (2) Productions with a syllable cluster (defined as a sequence of
    # two or more consonants within a syllable) receive one point for
    # each cluster:
    for syl in syls:
        if len(syl[0]) > 1:
            score += 1
        if len(syl[2]) > 1:
            score += 1
    ## Sound classes
    # (1) Productions with a velar consonant receive 1 point for each
    # velar
    for syl in syls:
        score += sum(ph in DORSALS for ph in (syl[0] + syl[2]))
    # (2) Productions with a liquid, a syllabic liquid, or a rhotic vowel
    # receive 1 point for each liquid, syllabic liquid, and rhotic vowel
    for syl in syls:
        score += sum(ph in LIQUIDS for ph in (syl[0] + syl[2]))
        score += sum(len(ph) > 1 and ph[1] == 'R' for ph in syl[1])
        # (3) Productions with a fricative or affricate receive 1 point
        # for each fricative and affricate
        score += sum(ph in AF for ph in (syl[0] + syl[2]))
    # (4) Productions with a voiced fricative or affricate receive 1 point
    # for each fricative and affricate (in addition to the point received
    # for #3)
    for syl in syls:
        score += sum(ph in VOICED_AF for ph in (syl[0] + syl[2]))
    # and we're done
    return score
Example #6
0
def wcm(phonemes, *sylab):
    """
    The "Word Complexity Measure", as proposed in:

    C. Stoel-Gammon. 2010. The Word Complexity Measure: Description and
    application to developmental phonology and disorders. Clinical
    Linguistics and Phonetics 24(4-5): 271-282.

    Syllables come back from `syllabify` indexed as (onset, nucleus,
    coda). The *sylab arguments are accepted but ignored, matching the
    original interface. Returns the integer score.
    """
    syllables = syllabify(phonemes)
    total = 0
    # Word patterns: length and stress placement.
    if len(syllables) > 2:
        total += 1
    if len(syllables) > 1 and not syllables[0][1][-1].endswith('1'):
        total += 1  # stress on a non-initial syllable (rule kept as-is)
    # Syllable structures: a word-final consonant scores one point.
    if syllables[-1][2] != []:
        total += 1
    # Clusters and sound classes, tallied per syllable in one pass
    # (addition commutes, so folding the original loops is safe).
    for syl in syllables:
        onset, nucleus, coda = syl[0], syl[1], syl[2]
        margins = onset + coda
        if len(onset) > 1:
            total += 1
        if len(coda) > 1:
            total += 1
        total += sum(seg in DORSALS for seg in margins)               # velars
        total += sum(seg in LIQUIDS for seg in margins)               # liquids
        total += sum(len(seg) > 1 and seg[1] == 'R' for seg in nucleus)  # rhotic vowels
        total += sum(seg in AF for seg in margins)                    # fricatives/affricates
        total += sum(seg in VOICED_AF for seg in margins)             # voiced ones again
    return total
Example #7
0
 def test_corpus_valid_syllables(self):
     """Every syllable of every corpus word maps to a known syllable type."""
     for word in read_corpus_words('../data/Ryan_Latin_master.txt'):
         syllables = syllabify(word)
         for syllable in syllables:
             try:
                 self.assertIn(identify_syllable_type(syllable), SYLLABLE_TYPES)
             except ValueError as err:
                 # dump the offending word before re-raising for easier triage
                 print('Word: ' + word)
                 print('Syllables: ' + str(syllables))
                 raise err
def recessive(w, treat_final_AI_OI_short=True, default_short=False):
    """Accent *w* recessively: pick the furthest-back possible accentuation.

    A "|" in *w* separates an unaccented prefix, which is passed through
    untouched.
    """
    pre = ""
    if "|" in w:
        pre, w = w.split("|")
    syllables = syllabify(w)
    # max() over the candidates is equivalent to sorted(..., reverse=True)[0]
    best = max(
        possible_accentuations(syllables, treat_final_AI_OI_short, default_short)
    )
    return pre + add_accent(syllables, best)
def on_penult(w, default_short=False):
    """Accent the penult of *w*: properispomenon if possible, else paroxytone.

    A "|" in *w* separates an unaccented prefix that is passed through.
    NOTE(review): if neither accentuation is possible this falls through
    and returns None, exactly as the original did — confirm callers
    handle that.
    """
    pre = ""
    if "|" in w:
        pre, w = w.split("|")
    syllables = syllabify(w)
    options = list(possible_accentuations(syllables, default_short=default_short))
    for accent in (PROPERISPOMENON, PAROXYTONE):
        if accent in options:
            return pre + add_accent(syllables, accent)
Example #10
0
def read_sample(sample, words=False, source='original'):
    """Extract token lines from a generated `sample` dict.

    Each entry of sample["text"] is a dict; the `source` key selects which
    rendition to split on whitespace. For char-level models the raw lines
    are syllabified (or returned as words); otherwise syllables are
    regrouped into words only when `words` is requested.
    """
    lines = [entry[source].split() for entry in sample["text"]]

    # char-level models were trained on free-running text, so "original"
    # carries no syllable boundaries and must be syllabified here
    if sample.get('model', 'default').lower().startswith('char'):
        return lines if words else [syllabify(line) for line in lines]

    # every other model type
    if words:
        return [[format_word(sylls) for sylls in group_syllables(line)]
                for line in lines]
    return lines
Example #11
0
def transcribe(word):
    """Return `word`'s first CMU pronunciation as syllable strings.

    Each syllable is rendered as its onset, nucleus, and coda segments
    joined by single spaces. Returns the sentinel string
    "NOT IN DICTIONARY" when the word has no CMU entry (return contract
    kept for existing callers).
    """
    # NOTE: loading the whole CMU dict on every call is expensive; bulk
    # callers may want to hoist this table out of the loop.
    cmu_dict = nltk.corpus.cmudict.dict()
    try:
        pron = cmu_dict[word][0]
    except (KeyError, IndexError):
        # Narrowed from a bare `except Exception`, which also silently
        # swallowed genuine errors raised by syllabify() below.
        return "NOT IN DICTIONARY"
    transcription = []
    for syllable in syllabify(pron):
        # onset + nucleus + coda, space-separated (same as the old
        # concatenate-then-strip construction)
        segments = list(syllable[0]) + list(syllable[1]) + list(syllable[2])
        transcription.append(" ".join(segments))
    return transcription
Example #12
0
def preprocess_words(path: str) -> List[List[str]]:
    """Read Latin words from *path*, syllabify them, and keep only those
    whose every (cleaned) syllable has a recognizable syllable type.

    Words with an unidentifiable syllable are reported and dropped.
    """
    words = read_corpus_words(path)
    total = len(words)
    print(f'Read in {total} Latin words')
    syllabified = [syllabify(w) for w in words]
    kept = []
    skipped = []
    for syllables in syllabified:
        try:
            # validation pass: any bad syllable raises ValueError
            for syllable in syllables:
                identify_syllable_type(clean_syllable(syllable))
            kept.append(syllables)
        except ValueError:
            skipped.append(''.join(syllables))
    print(f'Processed {len(kept)} out of {total} words')
    print(f'Skipped the following words: {str(skipped)}')

    return kept
Example #13
0
 def test_cvcv(self):
     """A CV.CV word splits into two open syllables."""
     self.assertEqual(['ti', 'bi'], syllabify('tibi'))
Example #14
0
 def test_cvcvc(self):
     """A CV.CVC word keeps the final consonant in the coda."""
     self.assertEqual(['bo', 'num'], syllabify('bonum'))
Example #15
0
import syllabify

# Demo script for the syllabify package, modernized from Python 2 print
# statements to Python 3 print() calls (the rest of this codebase is
# Python 3; the old form is a syntax error there).

print("\nEXAMPLE 1")
# syllabify a word, default behavior
syllabified = syllabify.syllabify("hello")
print(" ".join(syllabified))

print("\nEXAMPLE 2")
# default vowels are aeiouAEIOU
# syllabify with different vowels with the vowels argument
# this can be a list or a string or a set. sets are fastest
syllabified = syllabify.syllabify("happy")
print(" ".join(syllabified))
syllabified = syllabify.syllabify("happy",
                                  vowels=set(["a", "e", "i", "o", "u", "y"]))
print(" ".join(syllabified))

print("\nEXAMPLE 3")
# by default, vowels in hiatus are treated as independent nuclei
# specify which vowels form diphthongs with the diphthvowels argument
syllabified = syllabify.syllabify("look")
print(" ".join(syllabified))
syllabified = syllabify.syllabify("look", diphthvowels=["o"])
print(" ".join(syllabified))

print("\nEXAMPLE 4")
# by default, intervocalic clusters are split down the middle
# to specify onsets and coda to look for instead, use the onsets and codas arguments
# these expect sets or lists. sets are faster
# if no valid coda+onset combination exists, it resorts to default behavior
syllabified = syllabify.syllabify("fishing")
		## xx is an entry of cmuprosody, which we will now populate
	xx=[x[0],x[1]] #initialize the array; make it start off with the original cmu dictionary entry
		
		### Prosodic form array ###
	pform=[] #initialize an empty array - this will contain the prosodic form
	for e in x[1]: #loop through the 'array' of phonemes
		m=re.search('([012])',e) #search each one for a stress mark
		if m: #if you found one
			pform.append(m.group()) #append the stress mark to the prosodic form array
	xx.append(pform) # append the prosodic form to the entry
		
		### Syllabifying ###
	w = x[1]  #Now, create a new array of phonemes
	w = [re.sub('[0-9]', '', p) for p in w] #Strip out the numbers
	w = ' '.join(w)	#Make one string
	syl=syllabify(w) #syllabify the string
	syl=[re.sub('[(,)]','', p) for p in syl.split(' (')] #Split the syl into an array of strings, and remove parentheses
	subNasalsLiquids=1
	if subNasalsLiquids==1:
		for r in range(len(syl)):
			syl[r]=re.sub('AH [mn]','AN',syl[r])
			syl[r]=re.sub('AH l','AL',syl[r])
	xx.append(syl) #Now, append the syllabification
		
		### Making the syllabic form array ###
		#xx.append(syl) #For checking if this worked
	sform=[]		#Initialize an array to hold the syllabic forms
	for s in syl: #loop through the syllables
		ss=s.split() #split each syllable into segments
		ff=''		 #this is the string that holds the syllable's form
		for i in ss: #for each segment in the syllable
Example #17
0
def is_tense(word, pron):
    """
    True iff word `word` with pronunciation `pron` (represented as a list of
    ARPABET characters) has a tense short-a in the first syllable in the
    "classic" Philadelphia pattern. The algorithm (for lack of a better
    term) is as follows:

    * Check whether the word is a positive exception to tensing: if so
      return True
    * Check whether the word is a negative exception to tensing: if so
      return False
    * Check whether the word is an indeterminate word (at the moment, just
      "CAN"): if so return None
    * Syllabify and extract the onset, nucleus, and coda of the first
      syllable
    * Check whether the first-syllable nucleus is r-colored: if so return
      False
    * Check whether the first coda consonant of the first syllable is
      a tensing segment: if so return True
    * Check whether the word is two syllables, has an empty penultimate
      coda, but has an ultimate onset consisting of a tensing segment
      and ends in a suffix that triggers resyllabification in the classic
      system: so return True
    * Return False

    Load CMU dictionary for testing (NB: this does not have appropriate
    handling for words with multiple dictionary entries)

    >>> pron = {}
    >>> for line in open("dict", "r"):
    ...     if line.startswith(';'):
    ...         continue
    ...     (word, pron_string) = line.rstrip().split('  ', 1)
    ...     pron[word] = pron_string.split()

    # and, because it's not in the dictionary...
    >>> pron['GLADDEST'] = pron['GLAD'] + ['EH0', 'S', 'T']

    Positive exceptions:
    >>> is_tense('MADDER', pron['MADNESS'])
    True
    >>> is_tense('BADNESS', pron['BADNESS'])
    True
    >>> is_tense('GLADDEST', pron['GLADDEST'])
    True

    Negative exceptions:
    >>> is_tense('RAN', pron['RAN'])
    False
    >>> is_tense('SWAM', pron['SWAM'])
    False
    >>> is_tense('MATH', pron['MATH'])
    False
    >>> is_tense('SAD', pron['SAD'])
    False

    Tautosyllabic /m, n/:
    >>> is_tense('HAND', pron['HAND'])
    True
    >>> is_tense('HAM', pron['HAM'])
    True

    Tautosyllabic /f, θ, s/:
    >>> is_tense('HALF', pron['HALF'])
    True
    >>> is_tense('PATH', pron['HALF'])
    True
    >>> is_tense('PASS', pron['PASS'])
    True

    Closed syllables that go without:
    >>> is_tense('CASH', pron['CASH'])
    False
    >>> is_tense('BANG', pron['BANG'])
    False
    >>> is_tense('BAT', pron['BAT'])
    False
    >>> is_tense('BAG', pron['BAG'])
    False
    >>> is_tense('CAB', pron['CAB'])
    False

    Open syllables:
    >>> is_tense('HAMMER', pron['HAMMER'])
    False
    >>> is_tense('MANAGE', pron['MANAGE'])
    False
    >>> is_tense('MANAGED', pron['MANAGED'])
    False

    Opaque tensing in (re)open(ed) syllables:
    >>> is_tense('MANNING', pron['MANNING'])
    True
    >>> is_tense('MASSES', pron['MASSES'])
    True
    >>> is_tense('ASKING', pron['ASKING'])
    True
    >>> is_tense("PASSIN'", pron["PASSIN'"]) # Did we catch "-in'"?
    True

    (lexically) Unclassifiable:
    >>> is_tense('CAN', pron['CAN'])
    >>> is_tense('BEGAN', pron['BEGAN'])
    >>> is_tense('PAL', pron['PAL'][1:])
    >>> is_tense('SALAD', pron['SALAD'][1:])
    >>> is_tense('PLANETS', pron['PLANETS'])
    >>> is_tense("PLANET'S", pron["PLANET'S"])

    Formerly unclassifiable sC:
    >>> is_tense('ASPECT', pron['ASPECT'])
    False
    >>> is_tense('CASKET', pron['CASKET'])
    True
    >>> is_tense('ASKED', pron['ASKED'])
    True
    >>> is_tense('BASKETBALL', pron['BASKETBALL'])
    True

    Previously incorrectly marked as "unclassifiable":
    >>> is_tense('BANDSTAND', pron['BANDSTAND'])
    True
    >>> is_tense('BACKSTROKE', pron['BACKSTROKE'])
    False

    Previously incorrectly marked as 'lax':
    >>> is_tense('PROGRAM', pron['PROGRAM'][5:])
    True
    >>> is_tense('TRANSFER', pron['TRANSFER'])
    True

    Not handled programmatically yet: schwa-apocope (CAMERA), /t/ deleted (SANTA)
    """
    # normalize wordforms for lookup
    if word.endswith("IN'"):
        word = word[:-1] + 'G'  # "-in'" -> "-ing"
    elif word.endswith("'S"):
        word = word[:-2]
    elif word.endswith("'"):
        word = word[:-1]
    # check lexical exceptions
    if word in UNCLASSIFIABLE:
        return None
    if word in POSITIVE_EXCEPTIONS:
        return True
    if word in NEGATIVE_EXCEPTIONS:
        return False
    # exclude pre-/l/ tokens
    # NOTE(review): indexes pron[1] directly — a single-segment pron would
    # raise IndexError; confirm callers always pass multi-segment prons
    if pron[1] == 'L':
        return None
    # parse syllables, with "Alaska rule" ON
    syls = syllabify(pron)
    (onset, nucleus, coda) = syls[0]
    # we assume that R is parsed into the nucleus in certain contexts; in
    # this case the vowel is lax regardless of the coda's contents
    if len(nucleus) > 1 and nucleus[1] == 'R':
        return False
    # check for tautosyllabic tensing segment at the start of the coda
    if len(coda) > 0:
        if coda[0] in TENSERS:
            return True
    # check for the possibility of resyllabification opacifying tensing
    # (disyllables with an open penult whose ultima onset is a lone tenser)
    if len(syls) == 2 and not coda:
        if is_penultimate_syllable_resyllabified(word):
            resyl_onset = syls[1][0]
            if len(resyl_onset) == 1 and resyl_onset[0] in TENSERS:
                return True
    return False
def make_properispomenon(w):
    """Accent *w* as a properispomenon when possible, else as a paroxytone."""
    syllables = syllabify(w)
    if PROPERISPOMENON in possible_accentuations(syllables):
        accent = PROPERISPOMENON
    else:
        accent = PAROXYTONE
    return add_accent(syllables, accent)
def make_oxytone(w):
    """Apply the OXYTONE accentuation to *w*."""
    syllables = syllabify(w)
    return add_accent(syllables, OXYTONE)
Example #20
0
else:
    f = sys.stdin

# Load the CMU dictionary: word (lowercased) -> pronunciation string.
if args.cmusource:
    source_path = args.cmusource
else:
    source_path = "/tmp/cmudict"

dic = {}
# `with` ensures the dictionary file is closed (it previously leaked)
with open(source_path, "rt") as source:
    for line in source:
        if line[0] == ';':  # header, comments
            continue
        (word, pron) = line.rstrip().split('  ', 1)
        dic[word.lower()] = pron

# Syllabify each word read from `f` (set up earlier in this script).
for line in f.readlines():
    word = line.rstrip().lower()
    if word in dic:
        pron = dic[word]
        try:
            syllables = syllabify(pron.split())
            if args.verbose:
                print("word: {}\nnumber of syllables: {}\nsyllabification: {}".
                      format(word, len(syllables), pprint(syllables)))
            else:
                print(pprint(syllables))
        except ValueError as e:
            eprint(str(e))
    else:
        # report the stripped word, not the raw line (which carried a newline)
        print("{} not in dic".format(word))
Example #21
0
 def test_v(self):
     """A lone vowel is a single syllable."""
     self.assertEqual(['a'], syllabify('a'))
Example #22
0
 def test_vivc(self):
     """Intervocalic i begins the second syllable: e-ius."""
     self.assertEqual(['e', 'ius'], syllabify('eius'))
Example #23
0
 def test_cviv(self):
     """Consonantal i starts the second syllable: ca-ia."""
     self.assertEqual(['ca', 'ia'], syllabify('caia'))
Example #24
0
 def test_clvcvcc(self):
     """An initial cluster stays in the onset: pla-cent."""
     self.assertEqual(['pla', 'cent'], syllabify('placent'))
Example #25
0
 def test_crvc(self):
     """A CCVC monosyllable is returned whole."""
     self.assertEqual(['tres'], syllabify('tres'))
Example #26
0
 def test_vcvv_ia(self):
     """Final i-a vowels in hiatus split: a-li-a."""
     self.assertEqual(['a', 'li', 'a'], syllabify('alia'))
Example #27
0
 def test_cvrcvc(self):
     """The r closes the first syllable: par-tes."""
     self.assertEqual(['par', 'tes'], syllabify('partes'))
Example #28
0
 def test_cvv_ei(self):
     """e and i in hiatus split: de-i."""
     self.assertEqual(['de', 'i'], syllabify('dei'))
def make_proparoxytone(w):
    """Apply the PROPAROXYTONE accentuation to *w*."""
    syllables = syllabify(w)
    return add_accent(syllables, PROPAROXYTONE)
Example #30
0
 def test_dipthm(self):
     """An explicit hyphen mark stays attached to its syllable."""
     self.assertEqual(['co', 'e-', 'git'], syllabify('coe-git'))
Example #31
0
def is_tense(word, pron):
    """
    True iff word `word` with pronunciation `pron` (represented as a list of
    ARPABET characters) has a tense short-a in the first syllable in the
    "classic" Philadelphia pattern. The algorithm (for lack of a better
    term) is as follows:

    * Check whether the word is a positive exception to tensing: if so
      return True
    * Check whether the word is a negative exception to tensing: if so
      return False
    * Check whether the word is an indeterminate word (at the moment, just
      "CAN"): if so return None
    * Syllabify and extract the onset, nucleus, and coda of the first
      syllable
    * Check whether the first-syllable nucleus is r-colored: if so return
      False
    * Check whether the first coda consonant of the first syllable is
      a tensing segment: if so return True
    * Check whether the word is two syllables, has an empty penultimate
      coda, but has an ultimate onset consisting of a tensing segment
      and ends in a suffix that triggers resyllabification in the classic
      system: so return True
    * Return False

    Load CMU dictionary for testing (NB: this does not have appropriate
    handling for words with multiple dictionary entries)

    >>> pron = {}
    >>> for line in open("dict", "r"):
    ...     if line.startswith(';'):
    ...         continue
    ...     (word, pron_string) = line.rstrip().split('  ', 1)
    ...     pron[word] = pron_string.split()

    # and, because it's not in the dictionary...
    >>> pron['GLADDEST'] = pron['GLAD'] + ['EH0', 'S', 'T']

    Positive exceptions:
    >>> is_tense('MADDER', pron['MADNESS'])
    True
    >>> is_tense('BADNESS', pron['BADNESS'])
    True
    >>> is_tense('GLADDEST', pron['GLADDEST'])
    True

    Negative exceptions:
    >>> is_tense('RAN', pron['RAN'])
    False
    >>> is_tense('SWAM', pron['SWAM'])
    False
    >>> is_tense('MATH', pron['MATH'])
    False
    >>> is_tense('SAD', pron['SAD'])
    False

    Tautosyllabic /m, n/:
    >>> is_tense('HAND', pron['HAND'])
    True
    >>> is_tense('HAM', pron['HAM'])
    True

    Tautosyllabic /f, θ, s/:
    >>> is_tense('HALF', pron['HALF'])
    True
    >>> is_tense('PATH', pron['HALF'])
    True
    >>> is_tense('PASS', pron['PASS'])
    True

    Closed syllables that go without:
    >>> is_tense('CASH', pron['CASH'])
    False
    >>> is_tense('BANG', pron['BANG'])
    False
    >>> is_tense('BAT', pron['BAT'])
    False
    >>> is_tense('BAG', pron['BAG'])
    False
    >>> is_tense('CAB', pron['CAB'])
    False

    Open syllables:
    >>> is_tense('HAMMER', pron['HAMMER'])
    False
    >>> is_tense('MANAGE', pron['MANAGE'])
    False
    >>> is_tense('MANAGED', pron['MANAGED'])
    False

    Opaque tensing in (re)open(ed) syllables:
    >>> is_tense('MANNING', pron['MANNING'])
    True
    >>> is_tense('MASSES', pron['MASSES'])
    True
    >>> is_tense('ASKING', pron['ASKING'])
    True
    >>> is_tense("PASSIN'", pron["PASSIN'"]) # Did we catch "-in'"?
    True

    (lexically) Unclassifiable:
    >>> is_tense('CAN', pron['CAN'])
    >>> is_tense('BEGAN', pron['BEGAN'])
    >>> is_tense('PAL', pron['PAL'][1:])
    >>> is_tense('SALAD', pron['SALAD'][1:])
    >>> is_tense('PLANETS', pron['PLANETS'])
    >>> is_tense("PLANET'S", pron["PLANET'S"])

    Formerly unclassifiable sC:
    >>> is_tense('ASPECT', pron['ASPECT'])
    False
    >>> is_tense('CASKET', pron['CASKET'])
    True
    >>> is_tense('ASKED', pron['ASKED'])
    True
    >>> is_tense('BASKETBALL', pron['BASKETBALL'])
    True

    Previously incorrectly marked as "unclassifiable":
    >>> is_tense('BANDSTAND', pron['BANDSTAND'])
    True
    >>> is_tense('BACKSTROKE', pron['BACKSTROKE'])
    False

    Previously incorrectly marked as 'lax':
    >>> is_tense('PROGRAM', pron['PROGRAM'][5:])
    True
    >>> is_tense('TRANSFER', pron['TRANSFER'])
    True

    Not handled programmatically yet: schwa-apocope (CAMERA), /t/ deleted (SANTA)
    """
    # normalize wordforms for lookup
    if word.endswith("IN'"):
        word = word[:-1] + 'G'  # "-in'" -> "-ing"
    elif word.endswith("'S"):
        word = word[:-2]
    elif word.endswith("'"):
        word = word[:-1]
    # check lexical exceptions
    if word in UNCLASSIFIABLE:
        return None
    if word in POSITIVE_EXCEPTIONS:
        return True
    if word in NEGATIVE_EXCEPTIONS:
        return False
    # exclude pre-/l/ tokens
    # NOTE(review): indexes pron[1] directly — a single-segment pron would
    # raise IndexError; confirm callers always pass multi-segment prons
    if pron[1] == 'L':
        return None
    # parse syllables, with "Alaska rule" ON
    syls = syllabify(pron)
    (onset, nucleus, coda) = syls[0]
    # we assume that R is parsed into the nucleus in certain contexts; in
    # this case the vowel is lax regardless of the coda's contents
    if len(nucleus) > 1 and nucleus[1] == 'R':
        return False
    # check for tautosyllabic tensing segment at the start of the coda
    if len(coda) > 0:
        if coda[0] in TENSERS:
            return True
    # check for the possibility of resyllabification opacifying tensing
    # (disyllables with an open penult whose ultima onset is a lone tenser)
    if len(syls) == 2 and not coda:
        if is_penultimate_syllable_resyllabified(word):
            resyl_onset = syls[1][0]
            if len(resyl_onset) == 1 and resyl_onset[0] in TENSERS:
                return True
    return False
Example #32
0
 def test_corpus_syllabified(self):
     """Every corpus word syllabifies into at least one syllable."""
     corpus = read_corpus_words('../data/Ryan_Latin_master.txt')
     for entry in corpus:
         self.assertGreaterEqual(len(syllabify(entry)), 1)
Example #33
0
def syllabify_helper(pron):
    """Thin module-level wrapper around syllabify.syllabify."""
    result = syllabify.syllabify(pron)
    return result
Example #34
0
 def test_cvvc_eu(self):
     """e and u in hiatus split: de-um."""
     self.assertEqual(['de', 'um'], syllabify('deum'))
Example #35
0
#!/usr/bin/env python3

from syllabify import syllabify, display_syllable

import unicodedata


def d(s):
    """Return *s* in NFD (canonically decomposed) Unicode form."""
    return unicodedata.normalize("NFD", s)


def strip_accents(s):
    """Remove combining marks (category Mn) from *s* after decomposition."""
    decomposed = d(s)
    return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")


# Print each accent-stripped syllable of the third column of the corpus.
# `with` ensures the file handle is closed (it previously leaked).
with open("analysis/enchiridion.txt") as corpus:
    for line in corpus:
        word = line.strip().split()[2].strip("@")
        for syllable in syllabify(word):
            print(strip_accents(display_syllable(syllable).strip("’").lower()))
Example #36
0
def process_name(row, the_dict, syllable_dict, rhyme_dict, coda_dict, onset_dict):
	"""Tally syllable, rhyme, coda, and onset counts for one name record.

	`row` supplies (year, name, _, sex); `the_dict` maps a name to its
	pronunciations, the first of which is syllabified. Counts are
	accumulated per year and sex into the four dicts, which are mutated
	in place and also returned as a tuple (interface unchanged).

	Fixed: the Python 2-only `string.join(seq, sep)` calls — removed in
	Python 3 — are replaced with the equivalent `sep.join(seq)`.
	"""
	year = row[0]
	name = row[1].lower()
	sex = row[3]

	trans = the_dict[name][0]
	syls = syllabify(trans)
	mutable_syls = [list(syl) for syl in syls]
	# word-boundary markers on the first onset and the last coda
	mutable_syls[0][0] = ["#"] + mutable_syls[0][0]
	mutable_syls[-1][-1] = mutable_syls[-1][-1] + ["#"]

	# ensure the year/sex sub-dicts exist in every tally
	for tally in (syllable_dict, rhyme_dict, coda_dict, onset_dict):
		tally.setdefault(year, {}).setdefault(sex, {})

	for syl in mutable_syls:
		# normalize unstressed schwa
		syl[1] = [x.replace("AH0", "@") for x in syl[1]]

		rhyme_string = " ".join(syl[1] + syl[2])
		rhyme_counts = rhyme_dict[year][sex]
		rhyme_counts[rhyme_string] = rhyme_counts.get(rhyme_string, 0) + 1

		coda_string = " ".join(syl[2])
		coda_counts = coda_dict[year][sex]
		coda_counts[coda_string] = coda_counts.get(coda_string, 0) + 1

		onset_string = " ".join(syl[0])
		onset_counts = onset_dict[year][sex]
		onset_counts[onset_string] = onset_counts.get(onset_string, 0) + 1

		syllable_string = " ".join(item for sublist in syl for item in sublist)
		syllable_counts = syllable_dict[year][sex]
		syllable_counts[syllable_string] = syllable_counts.get(syllable_string, 0) + 1

	return (syllable_dict, rhyme_dict, coda_dict, onset_dict)
Example #37
0
 def test_cvccvc(self):
     """A medial cluster splits across the syllables: tan-tum."""
     self.assertEqual(['tan', 'tum'], syllabify('tantum'))
Example #38
0
 def test_cvccvv(self):
     """gal-li-a: the geminate splits and the final vowels are in hiatus."""
     self.assertEqual(['gal', 'li', 'a'], syllabify('gallia'))
Example #39
0
from syllabify import syllabify
import count_syl as cs
from textstat.textstat import textstat

# Quick comparison of three syllable-counting approaches.
a = syllabify('hello')
# this doesn't really work, it's for ARPANET, not English words
# (i.e. this syllabify expects phoneme input, not orthography)

b = cs.count_syllables('accident')
# this script seems to work pretty well, but gives lower and upper bound
# (a pair), not a single count

c = textstat.syllable_count('fragmentation')
# ^ works well — single integer count for orthographic words

def get_mnemonic_syllables(mn):
    """Return the total textstat syllable count across the words of *mn*."""
    # generator expression: no need to materialize a list just for sum()
    return sum(textstat.syllable_count(word) for word in mn.split())
Example #40
0
 def test_vccvc(self):
     """om-nes: the mn cluster splits between the syllables."""
     self.assertEqual(['om', 'nes'], syllabify('omnes'))
Example #41
0
 def test_cvmvc(self):
     """The hyphen mark stays with the first syllable: de--us."""
     self.assertEqual(['de-', 'us'], syllabify('de-us'))
Example #42
0
 def test_vcvvc_iu(self):
     """a-li-um: i and u in hiatus split."""
     self.assertEqual(['a', 'li', 'um'], syllabify('alium'))