def initial_consonants():
    """Yield the leading consonant cluster of each word in cmudict.

    For every word returned by ``cmudict.words()`` the characters are read
    from the start of the word.  ASCII letters other than the lowercase
    vowels ``a e i o u`` are collected until the first other character
    (a vowel or a non-letter) is reached.  The collected cluster is yielded
    only when it is non-empty.

    A word consisting solely of consonant letters yields nothing, because
    the scan never reaches a terminating character.
    """
    for word in cmudict.words():
        cluster = []
        for letter in word:
            is_consonant = (
                letter in string.ascii_letters and letter not in 'aeiou'
            )
            if is_consonant:
                cluster.append(letter)
                continue
            # The first vowel / non-letter ends the scan for this word.
            if cluster:
                yield ''.join(cluster)
            break
def final_consonants():
    """Yield the trailing consonant cluster of each word in cmudict.

    For every word returned by ``cmudict.words()`` the characters are read
    from the end of the word backwards.  ASCII letters other than the
    lowercase vowels ``a e i o u`` are collected until the first other
    character (a vowel or a non-letter) is reached.  The collected cluster
    is yielded, restored to reading order, only when it is non-empty.

    A word consisting solely of consonant letters yields nothing, because
    the scan never reaches a terminating character.
    """
    for word in cmudict.words():
        cluster = []
        for letter in reversed(word):
            is_consonant = (
                letter in string.ascii_letters and letter not in 'aeiou'
            )
            if is_consonant:
                cluster.append(letter)
                continue
            # The first vowel / non-letter (seen from the end) stops the scan.
            if cluster:
                # Letters were gathered back-to-front; flip them back.
                yield ''.join(reversed(cluster))
            break
def __init__(self, order):
    """Set up a model of the given order together with its back-off chain.

    Args:
        order: order of the model.  An order-1 model backs off to a
            ``G0Uniform`` distribution; any higher order backs off to an
            ``HPYLM`` of order ``order - 1``, so the chain is built
            recursively down to order 1.

    Raises:
        ValueError: if ``order`` is smaller than 1.  Without this check
            the recursion ``HPYLM(order - 1)`` never reaches the order-1
            base case and only stops with a ``RecursionError``.
    """
    if order < 1:
        raise ValueError("order must be at least 1, got %r" % (order,))

    # order of the model
    self.order = order

    # prior distribution over parameters
    # (initially discount=0.8, strength=1.0)
    self.prior = PYPrior(0.8, 1.0)

    # back-off distribution
    if order == 1:
        # Sized from the cmudict word list plus two extra entries.
        # NOTE(review): the meaning of the "+ 2" is not visible here --
        # presumably two special symbols; confirm against G0Uniform.
        self.backoff = G0Uniform(len(cmudict.words()) + 2)
    else:
        self.backoff = HPYLM(order - 1)

    # mapping of contexts to the corresponding Pitman-Yor Process
    self.u2pyp = {}
def CreatePickle(AlgQuiet=False):
    """Build the per-word syllable counts from cmudict and pickle them.

    Every word from ``cmudict.words()`` is looked up in ``GzzCMUDict`` and
    the sorted list of its per-pronunciation syllable counts is stored in
    the ``GdcSyllableCount`` mapping (both are defined outside this
    function).  The mapping is then dumped to ``cmusyllables.pickle`` in
    the current working directory.

    Args:
        AlgQuiet: when true, suppress the progress messages.  The error
            messages are always printed.

    Raises:
        IOError: if processing the word list or writing the pickle fails.
    """

    def SyllableCount(AszWord):
        """Return the syllable count of each pronunciation of ``AszWord``.

        One count is returned per pronunciation listed in ``GzzCMUDict``.
        A phoneme is counted as a syllable when its last character is a
        digit.
        """
        # http://groups.google.com/group/nltk-users/msg/81e70cb6704dc01e?pli=1
        return [
            len([y for y in x if isdigit(y[-1])])
            for x in GzzCMUDict[AszWord.lower()]
        ]

    try:
        for LszLine in cmudict.words():
            LszWord = LszLine.split(' ')[0].lower()
            LliSyllableList = SyllableCount(LszWord)
            if LszWord not in GdcSyllableCount:
                GdcSyllableCount[LszWord] = sorted(LliSyllableList)
                if not AlgQuiet:
                    print("%-20s added %s" % (LszWord, LliSyllableList))
            else:
                if not AlgQuiet:
                    print(" -Word (%s) found twice. First count was %s, second was %s" % (
                        LszWord, GdcSyllableCount[LszWord], LliSyllableList))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not disguised as an IOError.
        print("An error was encountered processing the file.")
        raise IOError("An error was encountered processing the file.")

    try:
        # -----
        # Now write the dictionary away to a new pickle file.
        # Protocol -1 selects the highest pickle protocol, which is a binary
        # format, so the file must be opened in binary mode.  The ``with``
        # block guarantees the handle is flushed and closed.
        with open('cmusyllables.pickle', 'wb') as LhaOutputFile:
            if not AlgQuiet:
                print("Finished processing input file\n\nNow dumping pickle file\n")
            pickle.dump(GdcSyllableCount, LhaOutputFile, -1)
        if not AlgQuiet:
            print("Pickle file cmusyllables.pickle has been created.")
    except Exception:
        print("An error was encountered writing the pickle file.")
        raise IOError("An error was encountered writing the pickle file.")
def get_pronunciations(word):
    """Return the pronunciations stored in ``dictionary`` for ``word``.

    When ``word`` itself is not a key of ``dictionary``, the closest
    spelling among ``cmudict.words()`` (as ranked by
    ``get_close_matches``) is looked up instead.

    Args:
        word: the word to look up.

    Returns:
        Whatever ``dictionary`` stores for the word or for its closest
        match.

    Raises:
        IndexError: if the word is unknown and no close match is found.
            The original bare ``[0]`` raised the same type, but without
            a message.
        KeyError: if the closest match is itself missing from
            ``dictionary``.
    """
    try:
        return dictionary[word]
    except KeyError:
        pass

    # Fuzzy matching on words to find the closest known spelling.
    matches = get_close_matches(word, cmudict.words(), 1)
    if not matches:
        raise IndexError(
            "no pronunciation or close match found for %r" % (word,)
        )

    # Other options to make this more accurate:
    #   - Break many-syllable words into likely part-words
    #   - Try all combos (stress/syllables only)
    #   - Add from shakespeare sonnets
    #   - Add from limericks
    #   - Add manually
    # Could also bias this for fewer changes near the end of the word for
    # the sake of rhyme only.
    return dictionary[matches[0]]
def CreatePickle(AlgQuiet=False):
    """Build the per-word syllable counts from cmudict and pickle them.

    Every word from ``cmudict.words()`` is looked up in ``GzzCMUDict`` and
    the sorted list of its per-pronunciation syllable counts is stored in
    the ``GdcSyllableCount`` mapping (both are defined outside this
    function).  The mapping is then dumped to ``cmusyllables.pickle`` in
    the current working directory.

    Args:
        AlgQuiet: when true, suppress the progress messages.  The error
            messages are always printed.

    Raises:
        IOError: if processing the word list or writing the pickle fails.
    """

    def SyllableCount(AszWord):
        """Return the syllable count of each pronunciation of ``AszWord``.

        One count is returned per pronunciation listed in ``GzzCMUDict``.
        A phoneme is counted as a syllable when its last character is a
        digit.
        """
        # http://groups.google.com/group/nltk-users/msg/81e70cb6704dc01e?pli=1
        return [
            len([y for y in x if isdigit(y[-1])])
            for x in GzzCMUDict[AszWord.lower()]
        ]

    try:
        for LszLine in cmudict.words():
            LszWord = LszLine.split(' ')[0].lower()
            LliSyllableList = SyllableCount(LszWord)
            if LszWord not in GdcSyllableCount:
                GdcSyllableCount[LszWord] = sorted(LliSyllableList)
                if not AlgQuiet:
                    print("%-20s added %s" % (LszWord, LliSyllableList))
            else:
                if not AlgQuiet:
                    print(" -Word (%s) found twice. First count was %s, second was %s" % (
                        LszWord, GdcSyllableCount[LszWord], LliSyllableList))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not disguised as an IOError.
        print("An error was encountered processing the file.")
        raise IOError("An error was encountered processing the file.")

    try:
        # -----
        # Now write the dictionary away to a new pickle file.
        # Protocol -1 selects the highest pickle protocol, which is a binary
        # format, so the file must be opened in binary mode.  The ``with``
        # block guarantees the handle is flushed and closed.
        with open('cmusyllables.pickle', 'wb') as LhaOutputFile:
            if not AlgQuiet:
                print("Finished processing input file\n\nNow dumping pickle file\n")
            pickle.dump(GdcSyllableCount, LhaOutputFile, -1)
        if not AlgQuiet:
            print("Pickle file cmusyllables.pickle has been created.")
    except Exception:
        print("An error was encountered writing the pickle file.")
        raise IOError("An error was encountered writing the pickle file.")
def vowels():
    """Yield the first and the last vowel cluster of each cmudict word.

    For every word returned by ``cmudict.words()`` two scans are made:

    * front to back: leading non-vowels are skipped, the first run of
      lowercase ``a e i o u`` is collected, and it is yielded as soon as a
      non-vowel follows it;
    * back to front: the same, with the run restored to reading order
      before it is yielded.

    A run that extends to the end of the scan (nothing follows it) is not
    yielded by that scan.
    """

    def first_closed_run(letters):
        # Return the first vowel run that is followed by a non-vowel,
        # or None when no such run exists in ``letters``.
        run = []
        for letter in letters:
            if letter in 'aeiou':
                run.append(letter)
            elif run:
                return ''.join(run)
        return None

    for word in cmudict.words():
        leading = first_closed_run(word)
        if leading is not None:
            yield leading

        trailing = first_closed_run(reversed(word))
        if trailing is not None:
            # Collected back-to-front; flip it into reading order.
            yield trailing[::-1]
import os.path
import pickle
import random
import sys
from collections import Counter, defaultdict
from curses.ascii import isdigit
from itertools import islice

from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer

VOWELS = "AEIOU"
CONSONANTS = "BCDFGHJKLMNPQRSTVWXYZ"
BORDER = "-----------------------------------"

# The CMU pronouncing dictionary is loaded once, at import time:
# ``phonedict`` is the word -> pronunciations mapping and ``cmuwords``
# the plain word list.
phonedict = cmudict.dict()
cmuwords = cmudict.words()


class TextHandler:
    """Holds the corpus files and word lists a text generator works from."""

    def __init__(self, order, files, rhyme_file, scary_file):
        """Store the settings and load the scary-word and rhyme data.

        Args:
            order: stored unchanged as ``self.order``.
            files: stored unchanged as ``self.files``; later handed to
                ``merge_text`` by ``get_matrix``.
            rhyme_file: passed to ``load_rhyme_dict``.
            scary_file: passed to ``read_scary_words``.
        """
        self.order, self.files = order, files
        # Scary words are read before the rhyme dictionary is loaded.
        self.scary_words = self.read_scary_words(scary_file)
        self.rhyme_dict = self.load_rhyme_dict(rhyme_file)

    def get_matrix(self):
        """Tokenize the merged corpus text and return its reverse matrix.

        The text produced by ``merge_text(self.files)`` is split into
        tokens made of word characters and apostrophes, and the token
        list is handed to ``create_reverse_matrix``.
        """
        merged = self.merge_text(self.files)
        word_splitter = RegexpTokenizer(r"[\w\']+")
        return self.create_reverse_matrix(word_splitter.tokenize(merged))
# A sample logging configuration. The only tangible logging
# performed by this configuration is to send an email to
# the site admins on every HTTP 500 error.
# See http://docs.djangoproject.com/en/dev/topics/logging for
# more details on how to customize your logging configuration.
LOGGING = dict(
    version=1,
    disable_existing_loggers=False,
    handlers={
        'mail_admins': {
            'level': 'ERROR',
            'class': 'django.utils.log.AdminEmailHandler',
        },
    },
    loggers={
        'django.request': {
            'handlers': ['mail_admins'],
            'level': 'ERROR',
            'propagate': True,
        },
    },
)

# The cmudict word list is loaded when the settings module is imported.
from nltk.corpus import cmudict
WORDS = cmudict.words()

# Imported last, so any name defined in localsettings replaces the value
# assigned above.
from localsettings import *
def getWordSyllablesLessOrEq(syllableNum):
    """Return a random cmudict word with at most ``syllableNum`` syllables.

    Words are drawn at random from ``cmudict.words()`` until one is found
    whose first pronunciation in ``d`` has a syllable count (as computed
    by ``countSyllables``) no greater than ``syllableNum``.

    Args:
        syllableNum: the maximum syllable count accepted.

    Returns:
        The first randomly drawn word that satisfies the limit.

    NOTE(review): if no word satisfies the limit (presumably
    ``syllableNum < 1``) this loop never terminates -- confirm callers
    only pass positive values.
    """
    # Fetch the word list once; ``cmudict.words()`` builds a fresh list on
    # every call, which previously happened again for every rejected draw.
    words = cmudict.words()
    word = random.choice(words)
    while countSyllables(d[word][0]) > syllableNum:
        word = random.choice(words)
    return word
''' this python code implements max match algorithm - it segments/tokenizes words from a sentence without any spaces/delimeters. Uses a pre defined dictionary of words uses a greedy approach - tries to match the longest possible word from string. if no word matches, creates single character as a word works very well on chinese language, not so well on english ''' __author__ = 'nishant' from nltk.corpus import cmudict WORD_LIST = cmudict.words() def max_match(string): length = len(string) # base condition of recursion if length == 0: return [] for i in range(length): first_word = string[:length-i] remaining_string = string[length-i:] if first_word in WORD_LIST: return [first_word] + max_match(remaining_string) # if no word matches, we consider the first character as a single word and apply max match recursively on remaining string first_word = string[0] remaining_string = string[1:]
print("Convert numbers into words...")
# Every all-digit token is spelled out; ``numwords`` maps the digit string
# to its spelled-out form and all other tokens pass through unchanged.
numbers = [str(x) for x in tokens if x.isdigit()]
words = [num2words(num) for num in numbers]
numwords = dict(zip(numbers, words))
tokens_nw = [numwords.get(x, x) for x in tokens]

#==============================================================================
# Normalise pronunciation to US English and correct misspellings.
# Get the brown and cmudict corpora and count information.
# Will use brown corpus for correction instead of cmudict as the latter
# does not provide frequency information.
#==============================================================================
print("Load corpora...")
dict_brown = Counter(brown.words())
dict_cmu = Counter(cmudict.words())
# some additional known strings
dict_mine = Counter(
    ["mrs", "miss", "mr", "dr", "prof", "dr.", "prof.", "\n", "lt.", '"'])
# The set of every known string: the union of the three vocabularies.
full_dict = set(dict_brown) | set(dict_cmu) | set(dict_mine)

print("Correct spelling and US grammar...")
# Known tokens are kept as they are; unknown ones go through ``spell`` with
# the brown counts.
tokens_sp = [x if x in full_dict else spell(x, dict_brown) for x in tokens_nw]
text_sp = ' '.join(tokens_sp)

# Format text
text_clean = formatter(text_sp)

print("Write cleaned text to 'cleaned_text.txt'...")
with io.open('./cleaned_text.txt', 'w', encoding='utf-8') as fid:
    fid.write(text_clean)
#!/usr/bin/env python #----------------# # All lines in EEBO: ./scansion.py ../../data/eebo_tcp_MARCH_2015/just_lines/ #----------------# from nltk.corpus import cmudict from nltk.corpus import stopwords import sys, glob, codecs import Levenshtein as lev from collections import defaultdict i = sys.argv[1] output = sys.argv[2] prondict = cmudict.dict() cmuwords = cmudict.words() def just_stress( word ): #Find a word in cmudict and return the numerical stresses for that word prons = prondict[word] stress = [] if len( prons ) > 1: #For one-syllable words, prefer a 0 over a 1 (if one of prondict's options is a 0) possibles = [] for s in prons: possible_stress = ''.join([ ''.join([char for char in syllable if char.isdigit()]) for syllable in s
def test_closest_word_krypton(self):
    """The closest words to "KRYPTON" within the 'c'..'d' slice of cmudict
    must be 'crippen' and 'crypto'."""

    def three_way(left, right):
        # Comparator handed to binary_search: 0 / 1 / -1 for
        # equal / greater / smaller.
        if left == right:
            return 0
        return 1 if left > right else -1

    vocabulary = cmudict.words()
    # Positions binary_search reports for 'c' and 'd'; the slice between
    # them is presumably the words starting with 'c'.
    first_c = binary_search(vocabulary, 'c', three_way)
    first_d = binary_search(vocabulary, 'd', three_way)
    candidates = vocabulary[first_c:first_d]
    assert self.calculator.closest_words("KRYPTON", candidates) == ['crippen', 'crypto']
import pickle from nltk.corpus import cmudict from itertools import islice from nltk.tokenize import RegexpTokenizer from collections import Counter, defaultdict from curses.ascii import isdigit VOWELS = "AEIOU" CONSONANTS = "BCDFGHJKLMNPQRSTVWXYZ" order = int(sys.argv[1]) content_text = sys.stdin.read() tokenizer = RegexpTokenizer('\w+') phonedict = cmudict.dict() wordsdict = cmudict.words() def window_generator(seq, n=2): it = iter(seq) result = tuple(islice(it, n)) if len(result) == n: yield result for elem in it: result = result[1:] + (elem,) yield result def matching_phonemes(words, mode): phonemes = [] for word in words:
# from lyricsFetch import lyricsFetch from __future__ import division import re from nltk.corpus import cmudict import time import numpy as np import os cmud = cmudict.dict() cmuw = cmudict.words() vowels = [ 'AA', 'AH', 'AW', 'EH', 'EY', 'IH', 'OW', 'UH', 'AE', 'AO', 'AY', 'ER', 'IY', 'OY', 'UW' ] fixed = {} rhyme2words = {} def get_lyric_ngrams(artistCount, songCount, category='rock', ngram=3): lf = lyricsFetch('rock', artistCount, songCount) lyrics = [] for i in range(artistCount - 1): lyrics.append(lf.getNextLyricSet()[2]) allLines = [] for l in lyrics: lines_str = re.sub('\n\n', '\n', l) lines = lines_str.split('\n') lines = [re.sub(r'([,!?]|\.{3})', r' \1', x) for x in lines] for li in lines:
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk
'''
◑ The CMU Pronouncing Dictionary contains multiple pronunciations for certain words.
How many distinct words does it contain?
What fraction of words in this dictionary have more than one possible pronunciation?
'''
from collections import Counter

from nltk.corpus import cmudict

# NOTE: despite its name this is the plain word list, not the pronunciation
# mapping.  A word appears in it once per pronunciation entry.
prondict = cmudict.words()

# Number of entries per distinct word (= number of pronunciations listed).
entries_per_word = Counter(prondict)

# Total number of entries.
print(len(prondict))

# Number of distinct words.
print(len(entries_per_word))

# Fraction of distinct words with more than one pronunciation.  The earlier
# formula, (entries - distinct) / entries, measured the share of surplus
# entries instead: a word with three pronunciations was counted twice and
# the denominator was the entry count rather than the word count.
multi_pronunciation = sum(1 for n in entries_per_word.values() if n > 1)
print(multi_pronunciation * 1.0 / len(entries_per_word))
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import cmudict  # holds English pronunciation rules

##################################################################
## Quick look
print(cmudict.fileids())  # ['cmudict']
print(type(cmudict))  # <class 'nltk.corpus.reader.cmudict.CMUDictCorpusReader'>
words = cmudict.words()  # fetched once and reused below
print(len(words))  # 133737 English words
print(words[:5])  # ['a', 'a.', 'a', 'a42128', 'aaa']

##################################################################
## entries()
# Uses the ``cmudict`` reader imported above; the ``nltk`` package itself is
# never imported in this script, so ``nltk.corpus.cmudict`` was a NameError.
entries = cmudict.entries()
print(len(entries))  # 133737
print(entries[:5])
# [('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ('a42128', ['EY1', 'F', 'AO1', 'R', 'T', 'UW1', 'W', 'AH1', 'N', 'T', 'UW1', 'EY1', 'T']), ('aaa', ['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1'])]
for entry in entries[42371:42374]:
    print(entry)
# ('fir', ['F', 'ER1'])
# ('fire', ['F', 'AY1', 'ER0'])
# ('fire', ['F', 'AY1', 'R'])

##################################################################
## dict()
prondict = cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]

# 'blog' is not in the dictionary, so the lookup raises KeyError.  The error
# is caught and shown so the rest of the script still runs.
try:
    print(prondict['blog'])
except KeyError as err:
    print('KeyError:', err)  # KeyError: 'blog'

prondict['blog'] = [['B', 'L', 'AA1', 'G']]  # add the entry ourselves
print(prondict['blog'])  # [['B', 'L', 'AA1', 'G']]
# Returns every phoneme from the last stressed vowel onward for a given word.
# Ex.: cakedays => EY1 + S
def getrhymepart(word):
    """Return the tail of ``word``'s first pronunciation used for rhyming.

    The first pronunciation in ``arpabet[word]`` is searched from its end
    for the last phoneme that also appears in the module-level ``vowels``
    list (presumably the stress-marked vowel phonemes).  The phonemes from
    that position onward are returned as a new list, with the final
    character of the first one removed (presumably its stress digit).

    Args:
        word: a key of ``arpabet``.

    Returns:
        The list of phonemes described above, or ``None`` when the
        pronunciation contains no phoneme from ``vowels``.
    """
    pronunciation = arpabet[word][0]
    rhyme_vowels = set(str(vowel) for vowel in vowels)
    for index in range(len(pronunciation) - 1, -1, -1):
        if str(pronunciation[index]) in rhyme_vowels:
            # Slicing copies, so ``arpabet`` itself is left untouched.
            rhymepart = pronunciation[index:]
            rhymepart[0] = rhymepart[0][:-1]
            return rhymepart
    return None


dividedscheme = [[2, 1], [2]]

# The two-syllable words and their rhyme parts are computed once up front.
# Previously ``cmudict.words()``, ``nsyl`` and ``getrhymepart`` were all
# re-evaluated inside the nested loop for every single word pair.
two_syllable = [
    (word, getrhymepart(word))
    for word in cmudict.words()
    if nsyl(word) == [2]
]

# Pair every two-syllable word with every two-syllable word (itself included)
# that has the same rhyme part.
for word1, rhymepart1 in two_syllable:
    print(rhymecount)
    print(word1)
    # Closing and re-opening in append mode flushes the pairs written so far.
    text_file.close()
    rhymecount = 0
    text_file = open("2-2-syl.txt", "a")
    for word2, rhymepart2 in two_syllable:
        if rhymepart1 == rhymepart2:
            n = text_file.write(word1 + "\n" + word2 + "\n" + "\n")
            rhymecount = rhymecount + 1
def closest_word_for_letter(target, first_letter, similar_phone_substitution_cost):
    """Find the closest word to ``target`` among one letter's cmudict words.

    The candidates are the slice of ``cmudict.words()`` between the
    positions ``binary_search`` reports for the lowercased ``first_letter``
    and for the character that follows it in code-point order.

    Args:
        target: passed through to ``closest_word``.
        first_letter: the letter selecting the candidate slice; it is
            lowercased before use.
        similar_phone_substitution_cost: passed through to
            ``closest_word``.

    Returns:
        Whatever ``closest_word`` returns for that candidate slice.
    """

    def three_way(left, right):
        # Comparator handed to binary_search: 0 / 1 / -1 for
        # equal / greater / smaller.
        if left == right:
            return 0
        return 1 if left > right else -1

    letter = first_letter.lower()
    vocabulary = cmudict.words()
    following_letter = chr(ord(letter) + 1)
    lower = binary_search(vocabulary, letter, three_way)
    upper = binary_search(vocabulary, following_letter, three_way)
    return closest_word(target, vocabulary[lower:upper],
                        similar_phone_substitution_cost)
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import cmudict  # holds English pronunciation rules

##################################################################
## Quick look
print(cmudict.fileids())  # ['cmudict']
print(type(cmudict))  # <class 'nltk.corpus.reader.cmudict.CMUDictCorpusReader'>
words = cmudict.words()  # fetched once and reused below
print(len(words))  # 133737 English words
print(words[:5])  # ['a', 'a.', 'a', 'a42128', 'aaa']

##################################################################
## entries()
# Uses the ``cmudict`` reader imported above; the ``nltk`` package itself is
# never imported in this script, so ``nltk.corpus.cmudict`` was a NameError.
entries = cmudict.entries()
print(len(entries))  # 133737
print(entries[:5])
# [('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ('a42128', ['EY1', 'F', 'AO1', 'R', 'T', 'UW1', 'W', 'AH1', 'N', 'T', 'UW1', 'EY1', 'T']), ('aaa', ['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1'])]
for entry in entries[42371:42374]:
    print(entry)
# ('fir', ['F', 'ER1'])
# ('fire', ['F', 'AY1', 'ER0'])
# ('fire', ['F', 'AY1', 'R'])

##################################################################
## dict()
prondict = cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]

# 'blog' is not in the dictionary, so the lookup raises KeyError.  The error
# is caught and shown so the rest of the script still runs.
try:
    print(prondict['blog'])
except KeyError as err:
    print('KeyError:', err)  # KeyError: 'blog'

prondict['blog'] = [['B', 'L', 'AA1', 'G']]  # add the entry ourselves
def _build_vocabulary(self):
    """Register every cmudict word that the vocabulary does not hold yet.

    Each word from ``cmudict.words()`` for which ``self.contains`` is false
    gets the next free id, i.e. the current length of ``self.id2word``.
    It is recorded in ``self.word2id`` and appended to ``self.id2word``.
    """
    for token in cmudict.words():
        if self.contains(token):
            # Already registered (the word list repeats some words).
            continue
        next_id = len(self.id2word)
        self.word2id[token] = next_id
        self.id2word.append(token)