def syllabify_cmu(cmu_token):
    """Syllabify a CMU-dictionary token using the `syllabify` package.

    Returns whatever `syllabify.syllabify` produces for English input.
    """
    import syllabify as sy
    # Collapse "PH 1"/"PH 2" into "PH1"/"PH2" so primary/secondary stress
    # markings are in the form syllabify expects.
    normalized = cmu_token.replace(' 2', '2').replace(' 1', '1')
    return sy.syllabify(sy.English, normalized)
def wcm(phonemes, *sylab):
    """Word Complexity Measure (Stoel-Gammon 2010) of a transcription.

    `phonemes` is run through `translator` and then syllabified; one point
    is scored per matching word pattern, syllable structure, and sound
    class, and the total is returned as an int.

    NOTE(review): *sylab is accepted but never used — confirm whether
    callers rely on passing it.
    """
    phonemes = translator(phonemes)
    syls = syllabify(phonemes)
    score = 0
    # Word patterns:
    # productions with more than two syllables receive 1 point
    if len(syls) > 2:
        score += 1
    # productions with stress on any syllable but the first receive 1 point
    if len(syls) > 1 and not syls[0][1][-1].endswith('1'):
        score += 1
    # Syllable structures:
    # productions with a word-final consonant receive 1 point
    if syls[-1][2] != []:
        score += 1
    # productions with a sequence of two or more consonants within
    # a syllable receive one point for each cluster (onset and coda
    # checked separately)
    for syl in syls:
        if len(syl[0]) > 1:
            score += 1
        if len(syl[2]) > 1:
            score += 1
    # Sound classes:
    # productions with a velar consonant receive 1 point for each velar
    for syl in syls:
        score += sum(ph in DORSALS for ph in (syl[0] + syl[2]))
    # productions with a liquid, a syllabic liquid, or a rhotic vowel
    # receive 1 point for each liquid, syllabic liquid, and rhotic vowel
    for syl in syls:
        score += sum(ph in LIQUIDS for ph in (syl[0] + syl[2]))
        # rhotic vowels carry an 'R' as their second character
        score += sum(len(ph) > 1 and ph[1] == 'R' for ph in syl[1])
        # productions with a fricative or affricate receive 1 point for
        # each fricative and affricate
        score += sum(ph in AF for ph in (syl[0] + syl[2]))
    # productions with a voiced fricative or affricate receive 1 point
    # for each (in addition to the point received above)
    for syl in syls:
        score += sum(ph in VOICED_AF for ph in (syl[0] + syl[2]))
    return score
def persistent(w, lemma):
    """Accent *w* persistently: keep the lemma's accent position and type,
    shifted by the difference in syllable count, falling back to the
    nearest legal accentuation when that position is impossible.
    """
    word = w.replace("|", "")
    lemma_place, lemma_accent = get_accent_type(lemma)
    syllables = syllabify(word)
    candidates = list(possible_accentuations(syllables))
    # Shift the lemma's accent slot by however many syllables this form
    # gained or lost relative to the lemma.
    shifted = len(syllables) - len(syllabify(lemma)) + lemma_place
    choice = (shifted, lemma_accent)
    if choice not in candidates:
        if lemma_accent == ACUTE and (shifted, CIRCUMFLEX) in candidates:
            # an acute may surface as a circumflex in the same slot
            choice = (shifted, CIRCUMFLEX)
        else:
            # otherwise retract the acute toward the start of the word
            for offset in range(1, 4):
                if (shifted - offset, ACUTE) in candidates:
                    choice = (shifted - offset, ACUTE)
                    break
    return add_accent(syllables, choice)
def wcm(phonemes, *sylab):
    """
    The "Word Complexity Measure", as proposed in:

    C. Stoel-Gammon. 2010. The Word Complexity Measure: Description and
    application to developmental phonology and disorders. Clinical
    Linguistics and Phonetics 24(4-5): 271-282.

    Returns the integer score for the syllabified `phonemes`.

    NOTE(review): *sylab is accepted but never used — confirm whether
    callers rely on passing it.
    """
    syls = syllabify(phonemes)
    # begin scoring
    score = 0
    ## Word patterns
    # (1) Productions with more than two syllables receive 1 point
    if len(syls) > 2:
        score += 1
    # FIXME <stupid_rule>
    # (2) Productions with stress on any syllable but the first receive
    # 1 point [this rule is stupid --KG]
    if len(syls) > 1 and not syls[0][1][-1].endswith('1'):
        score += 1
    # FIXME </stupid_rule>
    ## Syllable structures
    # (1) Productions with a word-final consonant receive 1 point
    if syls[-1][2] != []:
        score += 1
    # (2) Productions with a syllable cluster (defined as a sequence of
    # two or more consonants within a syllable) receive one point for
    # each cluster:
    for syl in syls:
        if len(syl[0]) > 1:
            score += 1
        if len(syl[2]) > 1:
            score += 1
    ## Sound classes
    # (1) Productions with a velar consonant receive 1 point for each
    # velar
    for syl in syls:
        score += sum(ph in DORSALS for ph in (syl[0] + syl[2]))
    # (2) Productions with a liquid, a syllabic liquid, or a rhotic vowel
    # receive 1 point for each liquid, syllabic liquid, and rhotic vowel
    for syl in syls:
        score += sum(ph in LIQUIDS for ph in (syl[0] + syl[2]))
        # rhotic vowels carry an 'R' as their second character
        score += sum(len(ph) > 1 and ph[1] == 'R' for ph in syl[1])
        # (3) Productions with a fricative or affricate receive 1 point for
        # each fricative and affricate
        score += sum(ph in AF for ph in (syl[0] + syl[2]))
    # (4) Productions with a voiced fricative or affricate receive 1 point
    # for each fricative and affricate (in addition to the point received
    # for #3)
    for syl in syls:
        score += sum(ph in VOICED_AF for ph in (syl[0] + syl[2]))
    # and we're done
    return score
def test_corpus_valid_syllables(self):
    """Every syllable of every corpus word has a recognized type."""
    for word in read_corpus_words('../data/Ryan_Latin_master.txt'):
        word_syllables = syllabify(word)
        for syllable in word_syllables:
            try:
                self.assertIn(identify_syllable_type(syllable), SYLLABLE_TYPES)
            except ValueError as e:
                # Show which word broke before re-raising for the report.
                print('Word: ' + word)
                print('Syllables: ' + str(word_syllables))
                raise e
def recessive(w, treat_final_AI_OI_short=True, default_short=False):
    """Accent *w* recessively: place the accent as far from the end of the
    word as the possible accentuations allow. A "pre|word" prefix marked
    with "|" is passed through unaccented.
    """
    pre = ""
    if "|" in w:
        pre, w = w.split("|")
    syllables = syllabify(w)
    options = possible_accentuations(
        syllables, treat_final_AI_OI_short, default_short
    )
    # The greatest (place, accent) tuple is the furthest-back legal accent.
    return pre + add_accent(syllables, max(options))
def on_penult(w, default_short=False):
    """Accent *w* on the penult, preferring a circumflex (properispomenon)
    over an acute (paroxytone). Returns None when neither is possible.
    A "pre|word" prefix marked with "|" is passed through unaccented.
    """
    pre = ""
    if "|" in w:
        pre, w = w.split("|")
    syllables = syllabify(w)
    options = list(
        possible_accentuations(syllables, default_short=default_short)
    )
    for accent in (PROPERISPOMENON, PAROXYTONE):
        if accent in options:
            return pre + add_accent(syllables, accent)
def read_sample(sample, words=False, source='original'):
    """Extract token lines from a sample dict, syllabifying or regrouping
    them depending on the sample's model type and the `words` flag.
    """
    tokenized = [line[source].split() for line in sample["text"]]
    # character-level models: char-level was trained on free running text
    # (so "original" doesn't have syllable boundaries)
    model_name = sample.get('model', 'default').lower()
    if model_name.startswith('char'):
        if words:
            return tokenized
        return [syllabify(line) for line in tokenized]
    # all other cases
    if not words:
        return tokenized
    return [[format_word(sylls) for sylls in group_syllables(line)]
            for line in tokenized]
def transcribe(word):
    """Return the syllabified CMU transcription of `word`.

    Each syllable is rendered as a space-separated string of its onset,
    nucleus, and coda segments, and the syllables are returned as a list.
    Returns the string "NOT IN DICTIONARY" when the word has no CMU entry.
    """
    cmuDict = nltk.corpus.cmudict.dict()
    try:
        pron = cmuDict[word][0]
    except KeyError:
        # Only a missing dictionary entry means "not found". The original
        # bare `except Exception` also swallowed real errors raised by
        # syllabify() and misreported them as lookup failures.
        return "NOT IN DICTIONARY"
    transcription = []
    for syllable in syllabify(pron):
        # syllable is an (onset, nucleus, coda) triple of segment lists.
        segments = list(syllable[0]) + list(syllable[1]) + list(syllable[2])
        transcription.append(" ".join(segments))
    return transcription
def preprocess_words(path: str) -> List[List[str]]:
    """Read Latin words from `path`, syllabify them, and keep only the
    words whose every syllable has an identifiable type.

    Words for which `clean_syllable`/`identify_syllable_type` raises
    ValueError are skipped (and reported). Returns the surviving words as
    lists of syllable strings.
    """
    words = read_corpus_words(path)
    word_count = len(words)
    print(f'Read in {word_count} Latin words')
    syllabified_words = [syllabify(word) for word in words]
    real_syllabified_words = []
    skipped = []
    for word in syllabified_words:
        try:
            # Validate every syllable; any ValueError rejects the word.
            for syllable in word:
                cleaned_syllable = clean_syllable(syllable)
                # result unused — the call matters only for its ValueError
                syl_type = identify_syllable_type(cleaned_syllable)
            real_syllabified_words.append(word)
        except ValueError:
            skipped.append(''.join(word))
    print(f'Processed {len(real_syllabified_words)} out of {word_count} words')
    print(f'Skipped the following words: {str(skipped)}')
    return real_syllabified_words
def test_cvcv(self):
    """'tibi' splits into two open syllables."""
    self.assertEqual(['ti', 'bi'], syllabify('tibi'))
def test_cvcvc(self):
    """'bonum' splits as CV.CVC."""
    self.assertEqual(['bo', 'num'], syllabify('bonum'))
# NOTE: Python 2 example script (uses print statements).
import syllabify

print "\nEXAMPLE 1"
# syllabify a word, default behavior
syllabified = syllabify.syllabify("hello")
print " ".join(syllabified)

print "\nEXAMPLE 2"
# default vowels are aeiouAEIOU
# syllabify with different vowels via the vowels argument;
# this can be a list or a string or a set. sets are fastest
syllabified = syllabify.syllabify("happy")
print " ".join(syllabified)
syllabified = syllabify.syllabify("happy", vowels=set(["a", "e", "i", "o", "u", "y"]))
print " ".join(syllabified)

print "\nEXAMPLE 3"
# by default, vowels in hiatus are treated as independent nuclei;
# specify which vowels form diphthongs with the diphthvowels argument
syllabified = syllabify.syllabify("look")
print " ".join(syllabified)
syllabified = syllabify.syllabify("look", diphthvowels=["o"])
print " ".join(syllabified)

print "\nEXAMPLE 4"
# by default, intervocalic clusters are split down the middle;
# to specify onsets and codas to look for instead, use the onsets and
# codas arguments. these expect sets or lists (sets are faster).
# if no valid coda+onset combination exists, it resorts to default behavior
syllabified = syllabify.syllabify("fishing")
## xx is an entry of cmuprosody, which we will now populate xx=[x[0],x[1]] #initialize the array; make it start off with the original cmu dictionary entry ### Prosodic form array ### pform=[] #initialize an empty array - this will contain the prosodic form for e in x[1]: #loop through the 'array' of phonemes m=re.search('([012])',e) #search each one for a stress mark if m: #if you found one pform.append(m.group()) #append the stress mark to the prosodic form array xx.append(pform) # append the prosodic form to the entry ### Syllabifying ### w = x[1] #Now, create a new array of phonemes w = [re.sub('[0-9]', '', p) for p in w] #Strip out the numbers w = ' '.join(w) #Make one string syl=syllabify(w) #syllabify the string syl=[re.sub('[(,)]','', p) for p in syl.split(' (')] #Split the syl into an array of strings, and remove parentheses subNasalsLiquids=1 if subNasalsLiquids==1: for r in range(len(syl)): syl[r]=re.sub('AH [mn]','AN',syl[r]) syl[r]=re.sub('AH l','AL',syl[r]) xx.append(syl) #Now, append the syllabification ### Making the syllabic form array ### #xx.append(syl) #For checking if this worked sform=[] #Initialize an array to hold the syllabic forms for s in syl: #loop through the syllables ss=s.split() #split each syllable into segments ff='' #this is the string that holds the syllable's form for i in ss: #for each segment in the syllable
def is_tense(word, pron): """ True iff word `word` with pronuciation `pron` (represented as a list of ARPABET characters) has a tense short-a in the first syllable in the "classic" Philadelphia pattern. The algorithm (for lack of a better term) is as follows: * Check whether the word is a positive exception to tensing: if so return True * Check whether the word is a negative exception to tensing: if so return False * Check whether the word is an indeterminate word (at the moment, just "CAN"): if so return None * Syllabify and extract the onset, nucleus, and coda of the first syllable * Check whether the first-syllable nucleus is r-colored: if so return False * Check whether the first coda consonant of the first syllable is a tensing segment: if so return True * Check whether the word is two syllables, has an empty penultimate coda, but has an ultimate onset consisting of a tensing segment and ends in a suffix that triggers resyllabification in the classic system: so return True * Return False Load CMU dictionary for testing (NB: this does not have appropriate handling for words with multiple dictionary entries) >>> pron = {} >>> for line in open("dict", "r"): ... if line.startswith(';'): ... continue ... (word, pron_string) = line.rstrip().split(' ', 1) ... pron[word] = pron_string.split() # and, because it's not in the dictionary... 
>>> pron['GLADDEST'] = pron['GLAD'] + ['EH0', 'S', 'T'] Positive exceptions: >>> is_tense('MADDER', pron['MADNESS']) True >>> is_tense('BADNESS', pron['BADNESS']) True >>> is_tense('GLADDEST', pron['GLADDEST']) True Negative exceptions: >>> is_tense('RAN', pron['RAN']) False >>> is_tense('SWAM', pron['SWAM']) False >>> is_tense('MATH', pron['MATH']) False >>> is_tense('SAD', pron['SAD']) False Tautosyllabic /m, n/: >>> is_tense('HAND', pron['HAND']) True >>> is_tense('HAM', pron['HAM']) True Tautosyllabic /f, θ, s/: >>> is_tense('HALF', pron['HALF']) True >>> is_tense('PATH', pron['HALF']) True >>> is_tense('PASS', pron['PASS']) True Closed syllables that go without: >>> is_tense('CASH', pron['CASH']) False >>> is_tense('BANG', pron['BANG']) False >>> is_tense('BAT', pron['BAT']) False >>> is_tense('BAG', pron['BAG']) False >>> is_tense('CAB', pron['CAB']) False Open syllables: >>> is_tense('HAMMER', pron['HAMMER']) False >>> is_tense('MANAGE', pron['MANAGE']) False >>> is_tense('MANAGED', pron['MANAGED']) False Opaque tensing in (re)open(ed) syllables: >>> is_tense('MANNING', pron['MANNING']) True >>> is_tense('MASSES', pron['MASSES']) True >>> is_tense('ASKING', pron['ASKING']) True >>> is_tense("PASSIN'", pron["PASSIN'"]) # Did we catch "-in'"? 
True (lexically) Unclassifiable: >>> is_tense('CAN', pron['CAN']) >>> is_tense('BEGAN', pron['BEGAN']) >>> is_tense('PAL', pron['PAL'][1:]) >>> is_tense('SALAD', pron['SALAD'][1:]) >>> is_tense('PLANETS', pron['PLANETS']) >>> is_tense("PLANET'S", pron["PLANET'S"]) Formerly unclassifiable sC: >>> is_tense('ASPECT', pron['ASPECT']) False >>> is_tense('CASKET', pron['CASKET']) True >>> is_tense('ASKED', pron['ASKED']) True >>> is_tense('BASKETBALL', pron['BASKETBALL']) True Previously incorrectly marked as "unclassifiable": >>> is_tense('BANDSTAND', pron['BANDSTAND']) True >>> is_tense('BACKSTROKE', pron['BACKSTROKE']) False Previously incorrectly marked as 'lax': >>> is_tense('PROGRAM', pron['PROGRAM'][5:]) True >>> is_tense('TRANSFER', pron['TRANSFER']) True Not handled programmatically yet: schwa-apocope (CAMERA), /t/ deleted (SANTA) """ # normalize wordforms for lookup if word.endswith("IN'"): word = word[:-1] + 'G' elif word.endswith("'S"): word = word[:-2] elif word.endswith("'"): word = word[:-1] # check lexical exceptions if word in UNCLASSIFIABLE: return None if word in POSITIVE_EXCEPTIONS: return True if word in NEGATIVE_EXCEPTIONS: return False # exclude pre-/l/ tokens if pron[1] == 'L': return None # parse syllables, with "Alaska rule" ON syls = syllabify(pron) (onset, nucleus, coda) = syls[0] # we assume that R is parsed into the nucleus in certain contexts; in # this case the vowel is lax regardless of the coda's contents if len(nucleus) > 1 and nucleus[1] == 'R': return False # check for tautosyllabic tensing segment at the start of the coda if len(coda) > 0: if coda[0] in TENSERS: return True # check for the possibility of resyllabification opacifying tensing if len(syls) == 2 and not coda: if is_penultimate_syllable_resyllabified(word): resyl_onset = syls[1][0] if len(resyl_onset) == 1 and resyl_onset[0] in TENSERS: return True return False
def make_properispomenon(w):
    """Accent *w* with a circumflex on the penult when that accentuation
    is possible; otherwise fall back to an acute on the penult.
    """
    syllables = syllabify(w)
    if PROPERISPOMENON in possible_accentuations(syllables):
        accent = PROPERISPOMENON
    else:
        accent = PAROXYTONE
    return add_accent(syllables, accent)
def make_oxytone(w):
    """Accent *w* as an oxytone (acute on the final syllable)."""
    syllables = syllabify(w)
    return add_accent(syllables, OXYTONE)
else: f = sys.stdin if args.cmusource: source = open(args.cmusource, "rt") else: source = open("/tmp/cmudict", "rt") dic = {} for line in source: if line[0] == ';': # header, commenst continue (word, pron) = line.rstrip().split(' ', 1) dic[word.lower()] = pron for line in f.readlines(): word = line.rstrip().lower() if word in dic: pron = dic[word] try: syllables = syllabify(pron.split()) if args.verbose: print("word: {}\nnumber of syllables: {}\nsyllabification: {}". format(word, len(syllables), pprint(syllables))) else: print(pprint(syllables)) except ValueError as e: eprint(str(e)) else: print("{} not in dic".format(line.lower()))
def test_v(self):
    """A single vowel is one syllable."""
    self.assertEqual(['a'], syllabify('a'))
def test_vivc(self):
    """'eius': intervocalic i starts the second syllable."""
    self.assertEqual(['e', 'ius'], syllabify('eius'))
def test_cviv(self):
    """'caia': intervocalic i acts as a consonant onset."""
    self.assertEqual(['ca', 'ia'], syllabify('caia'))
def test_clvcvcc(self):
    """'placent': initial cluster stays together; final cluster closes."""
    self.assertEqual(['pla', 'cent'], syllabify('placent'))
def test_crvc(self):
    """'tres' is a single syllable."""
    self.assertEqual(['tres'], syllabify('tres'))
def test_vcvv_ia(self):
    """'alia': vowels i and a are in hiatus, giving three syllables."""
    self.assertEqual(['a', 'li', 'a'], syllabify('alia'))
def test_cvrcvc(self):
    """'partes': r closes the first syllable."""
    self.assertEqual(['par', 'tes'], syllabify('partes'))
def test_cvv_ei(self):
    """'dei': e and i are in hiatus, giving two syllables."""
    self.assertEqual(['de', 'i'], syllabify('dei'))
def make_proparoxytone(w):
    """Accent *w* as a proparoxytone (acute on the antepenult)."""
    syllables = syllabify(w)
    return add_accent(syllables, PROPAROXYTONE)
def test_dipthm(self):
    """'coe-git': the hyphen-marked vowels stay separate syllables."""
    self.assertEqual(['co', 'e-', 'git'], syllabify('coe-git'))
def test_corpus_syllabified(self):
    """Every corpus word yields at least one syllable."""
    for word in read_corpus_words('../data/Ryan_Latin_master.txt'):
        self.assertGreaterEqual(len(syllabify(word)), 1)
def syllabify_helper(pron):
    """Delegate to syllabify.syllabify for pronunciation `pron`."""
    return syllabify.syllabify(pron)
def test_cvvc_eu(self): word = 'deum' syllables = syllabify(word) self.assertEqual(['de', 'um'], syllables)
#!/usr/bin/env python3
"""Print the accent-stripped, lower-cased syllables of each word in the
enchiridion analysis file, one syllable per line."""
from syllabify import syllabify, display_syllable
import unicodedata


def d(s):
    # Decompose to NFD so combining marks become separate codepoints.
    return unicodedata.normalize("NFD", s)


def strip_accents(s):
    # Drop every combining mark (Unicode category "Mn") from the
    # decomposed string.
    kept = [c for c in d(s) if unicodedata.category(c) != "Mn"]
    return ''.join(kept)


for line in open("analysis/enchiridion.txt"):
    word = line.strip().split()[2].strip("@")
    for syllable in syllabify(word):
        rendered = display_syllable(syllable).strip("’").lower()
        print(strip_accents(rendered))
def process_name(row, the_dict, syllable_dict, rhyme_dict, coda_dict, onset_dict):
    """Tally syllable, rhyme, coda, and onset frequencies for one name row.

    `row` is (year, name, ..., sex); `the_dict` maps names to lists of
    pronunciations. The four tally dicts are nested as
    dict[year][sex][string] -> count, are updated in place, and are also
    returned as a tuple for convenience.

    Fix: the original used `string.join(xs, " ")`, which was removed from
    the `string` module in Python 3; `" ".join(xs)` is the equivalent and
    works on both Python 2 and 3.
    """
    year = row[0]
    name = row[1].lower()
    sex = row[3]
    trans = the_dict[name][0]
    syls = syllabify(trans)
    # Copy each (onset, nucleus, coda) triple into mutable lists so the
    # word-boundary markers can be spliced in.
    mutable_syls = [[list(part) for part in syl] for syl in syls]
    mutable_syls[0][0] = ["#"] + mutable_syls[0][0]
    mutable_syls[-1][-1] = mutable_syls[-1][-1] + ["#"]
    # Make sure the (year, sex) bucket exists in every tally.
    for tally in (syllable_dict, rhyme_dict, coda_dict, onset_dict):
        tally.setdefault(year, {}).setdefault(sex, {})

    def _count(tally, key):
        # One-line replacement for the repeated if/else increment blocks.
        bucket = tally[year][sex]
        bucket[key] = bucket.get(key, 0) + 1

    for syl in mutable_syls:
        # Reduce unstressed schwa to "@" in the nucleus.
        syl[1] = [x.replace("AH0", "@") for x in syl[1]]
        _count(rhyme_dict, " ".join(syl[1] + syl[2]))
        _count(coda_dict, " ".join(syl[2]))
        _count(onset_dict, " ".join(syl[0]))
        _count(syllable_dict, " ".join(seg for part in syl for seg in part))
    return (syllable_dict, rhyme_dict, coda_dict, onset_dict)
def test_cvccvc(self):
    """'tantum': the medial cluster is split between syllables."""
    self.assertEqual(['tan', 'tum'], syllabify('tantum'))
def test_cvccvv(self):
    """'gallia': cluster split plus i/a hiatus gives three syllables."""
    self.assertEqual(['gal', 'li', 'a'], syllabify('gallia'))
from syllabify import syllabify
import count_syl as cs
from textstat.textstat import textstat

# Quick comparison of three syllable-counting approaches:
# this doesn't really work — syllabify expects ARPAbet phoneme input
# ("ARPANET" in the original note), not English orthography
a = syllabify('hello')
# this script seems to work pretty well, but gives lower and upper bound
b = cs.count_syllables('accident')
# textstat works well
c = textstat.syllable_count('fragmentation')


def get_mnemonic_syllables(mn):
    """Total syllable count across all whitespace-separated words of `mn`."""
    return sum([textstat.syllable_count(a) for a in mn.split()])
def test_vccvc(self):
    """'omnes': the mn cluster is split between syllables."""
    self.assertEqual(['om', 'nes'], syllabify('omnes'))
def test_cvmvc(self):
    """'de-us': the hyphen-marked vowel stays in its own syllable."""
    self.assertEqual(['de-', 'us'], syllabify('de-us'))
def test_vcvvc_iu(self):
    """'alium': i and u are in hiatus, giving three syllables."""
    self.assertEqual(['a', 'li', 'um'], syllabify('alium'))