Example #1
    def _build_wordset(clazz, obscurity_limit):
        # I'm sorry this method is so disgusting.
        # It's all in the cause of fast loading in the main case.

        from nltk import FreqDist

        # Ensure corpora are loaded.
        try:
            from nltk.corpus import cmudict
            cmudict.entries()
        except LookupError:
            print "CMUDict corpus not found. Downloading..."
            from nltk import download
            download('cmudict')
            print "[Done]"
        if obscurity_limit is not None:
            from nltk.corpus import brown
            try:
                brown.words()
            except LookupError:
                print "Brown corpus not found. Downloading...",
                from nltk import download
                download('brown')
                print "[Done]"

        words = cmudict.entries()
        if obscurity_limit is not None:
            freqs = FreqDist([w.lower() for w in brown.words()])
            words = sorted(words,
                           key=lambda x: freqs[x[0].lower()],
                           reverse=True)
            return words[:obscurity_limit]
        else:
            return list(words)
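A hedged usage sketch of the method above: the clazz argument is unused in the body, and obscurity_limit keeps only the N most frequent words according to the Brown corpus (the cutoff below is purely illustrative, not from the original code).
# Illustrative only: 5000 is an arbitrary cutoff.
common_entries = _build_wordset(None, obscurity_limit=5000)
word, phonemes = common_entries[0]   # each item is a (word, phoneme-list) pair from cmudict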
Example #2
def rhyme(inp, level):
    entries = cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    for (word, syllable) in syllables:
        rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
    return set(rhymes)
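A minimal usage sketch for the rhyme() helper above; the cmudict import is assumed since the snippet omits it, and level is the number of trailing phonemes that must match.
from nltk.corpus import cmudict   # assumed import; rhyme() calls cmudict.entries()

print(sorted(rhyme("cat", 2))[:10])   # some words whose last two phonemes match "cat"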
Example #3
    def get_rhymes(self, word):
        rhymes = []

        word_pronounciations = cmudict.dict()[word]
        for word_pronounciation in word_pronounciations:
            for rhyme, rhyme_pronounciation in cmudict.entries():
                if rhyme_pronounciation[-1] == word_pronounciation[-1]:
                    rhymes.append(rhyme)

        return rhymes
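get_rhymes() above rescans cmudict.entries() once per pronunciation, which is slow when called repeatedly. A sketch of the same idea with a precomputed index keyed on the final phoneme (the names here are illustrative, not from the original class):
from collections import defaultdict
from nltk.corpus import cmudict

pron_dict = cmudict.dict()            # word -> list of pronunciations
last_phone_index = defaultdict(list)  # final phoneme -> words ending with it
for w, pron in cmudict.entries():
    last_phone_index[pron[-1]].append(w)

def get_rhymes_fast(word):
    # illustrative variant of get_rhymes(): bucket lookups instead of rescanning entries
    rhymes = []
    for pron in pron_dict.get(word, []):
        rhymes.extend(last_phone_index[pron[-1]])
    return rhymes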
Example #4
def ex12():
  import nltk
  from nltk.corpus import cmudict
  entries = cmudict.entries()
  words = [word for word, pron in entries]
  distinct_words = set(words)
  fd = nltk.FreqDist(words)
  # count the words that appear with more than one pronunciation
  multi_prons = sum(1 for key in fd if fd[key] > 1)
  print("#-distinct words:", len(distinct_words))
  print("#-words with multiple prons:", multi_prons)
Example #5
def produce_lexical_index(lexicon):
	"""returns a lexical index"""

	index = []
	pronunciations = cmudict.entries()

	# makes an index consisting of the pronunciations and meanings.

	for word in lexicon:
		prons = find_pronunciations(word, pronunciations)
		meanings = find_meanings(word)
		index.append((word, prons, meanings))

	return index
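produce_lexical_index() relies on two helpers that are not shown. A hedged sketch of what find_pronunciations() might look like, based only on how it is called above; find_meanings() would depend on whatever sense inventory the project uses.
def find_pronunciations(word, pronunciations):
    # Hypothetical helper: collect every pronunciation listed for `word`
    # among the (word, phoneme-list) pairs from cmudict.entries().
    return [pron for w, pron in pronunciations if w == word]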
Example #6
def loadCmuDict(path):
    
    cmudict_entries = {}
    for word, phonomes in cmudict.entries():
        phonomes = deleteStress(phonomes)
        
        if word not in cmudict_entries:         # the word is not present yet (add a new entry)
            phonomes_list = []
            phonomes_list.append(phonomes)
            cmudict_entries[word] = phonomes_list
        else:                                   # the word is already registered (strong / weak forms)
            if not isDuplicated(phonomes, cmudict_entries[word]):
                cmudict_entries[word].append(phonomes)
    
    return cmudict_entries
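loadCmuDict() depends on deleteStress() and isDuplicated(), which are not included in the snippet. A plausible sketch, assuming deleteStress() simply strips the 0/1/2 stress digits so that strong and weak forms compare equal:
def deleteStress(phonemes):
    # Hypothetical helper: drop stress digits, e.g. 'AH0' -> 'AH'
    return [p.rstrip('012') for p in phonemes]

def isDuplicated(phonemes, known_pronunciations):
    # Hypothetical helper: True if this pronunciation was already recorded for the word
    return phonemes in known_pronunciations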
Example #7
def rhyme(inp, level):
	global rhyme_chunks
	if(not inp.isalpha()):
		return []
	inp=inp.lower()
	key=""
	entries=cmudict.entries()
	syllables = [(word, syl) for word, syl in entries if word == inp]
	if(len(syllables)>0):
		key=repr(syllables[0][1][-level:])
		if(key in rhyme_chunks):
			print("Skipping because "+inp+" has a memoized rhyme")
			return rhyme_chunks[key]
	else:
		print("Skipping because "+inp+" has no rhymes")
		return []
	myRhymes = []
	for (word, syllable) in syllables:
		myRhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
	if(len(syllables)>0): 
		rhyme_chunks[key]=list(set(myRhymes))
	return list(set(myRhymes))
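The memoized variant above expects a module-level rhyme_chunks cache; a minimal hedged usage sketch:
from nltk.corpus import cmudict   # assumed import; the function uses cmudict.entries()

rhyme_chunks = {}                 # cache consulted through `global rhyme_chunks`
first = rhyme("cat", 2)           # computed and memoized
second = rhyme("cat", 2)          # served from rhyme_chunks on the second call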
Example #8
from nltk.corpus import cmudict
interested_words = ["what" , "f**k" , "shit"]
interested_prouns = [cmudict.dict()[word][0][-2:] for word in interested_words ]
print([word for word, proun in cmudict.entries() if proun[-2:] in interested_prouns])
Example #9
def ShowReadability():
    text.insert(END, "If this doesn't work, check NLTK is installed. If NLTK is installed, use nltk.download() to get cmudict and punkt sentence tokenizer. See Help for details \n\n\n")
    import nltk
    import re
    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0,END)
    rawtext=nltk.regexp_tokenize(data, pattern)
    prepcolloc = [w.lower() for w in rawtext]  # a list, so it can be iterated more than once below
    text.delete(1.0, END)
    #sentences
    sentcountshort = 0
    sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(data)    
    for sent in sents:
        if len(sent) < 2:
            sentcountshort = sentcountshort+1
    
    numsents = len(sents) - sentcountshort
    numwords = len(rawtext)  # number of word tokens from the regexp tokenizer
    sentcountshort = 0
    
    text.insert(END, "\nIgnoring one word sentences (like numbering), there are ")
    text.insert(END, numsents)
    text.insert(END, " sentences with an average of ")
    averagewordspersentence = numwords/numsents
    text.insert(END, averagewordspersentence)
    text.insert(END, " words per sentence.\n\n")


    #set up syllable dictionary        
    from math import sqrt as squareroot
    from nltk.corpus import cmudict
    syllables = dict()
    numeral = re.compile(r'\d')
    for (word, phonemes) in cmudict.entries():
        word = word.lower()
        count = len([x for x in list(''.join(phonemes)) if x >= '0' and x <= '9'])
        if word in syllables:
            count = min(count, syllables[word])
        syllables[word] = count        

    #count syllables    
    numsyllables=0
    wordsnotincmu=dict()
    for word in prepcolloc:
        if word in syllables:
            numsyllables = numsyllables + syllables[word]
        else:
            wordsnotincmu[word] = 1
        
    #count three syllable words
    threesyllcount=0
    for word in prepcolloc:
        if word in syllables and syllables[word] > 2:
            threesyllcount = threesyllcount + 1
        

    #calculate number of letters and numbers
    letnumcount=0
    for word in rawtext:
        if word.isalpha():
            letnumcount=letnumcount + len(word)        

    #adapted from Java at http://www.editcentral.com/gwt1/EditCentral.html
    #Flesch
    Flesch = 206.835 - (1.015 * numwords) / numsents - (84.6 * numsyllables) / numwords
    Flesch = "%.1f" % Flesch
    #Automated readability index
    ARI = (4.71 * letnumcount) / numwords + (0.5 * numwords) / numsents - 21.43
    ARI = "%.1f" % ARI

    #Flesch-Kincaid grade level
    FK = (0.39 * numwords) / numsents + (11.8 * numsyllables) / numwords - 15.59
    FK = "%.1f" % FK

    #Coleman-Liau index
    CL = (5.89 * letnumcount) / numwords - (30.0 * numsents) / numwords - 15.8
    CL = "%.1f" % CL

    #Gunning fog
    GunningFog = 0.4 * (numwords / numsents + (100.0 * threesyllcount) / numwords)
    GunningFog = "%.1f" % GunningFog
    #SMOG
    smog = squareroot(threesyllcount * 30.0 / numsents) + 3.0
    smog = "%.1f" % smog

    text.insert(END, "Flesch: ")
    text.insert(END, Flesch)
    text.insert(END, "\n")
    text.insert(END, "Automated readability index: ")
    text.insert(END, ARI)
    text.insert(END, "\n")
    text.insert(END, "Flesch-Kincaid grade level: ")
    text.insert(END, FK)
    text.insert(END, "\n")

    text.insert(END, "Coleman-Liau index: ")
    text.insert(END, CL)
    text.insert(END, "\n")

    text.insert(END, "Gunning fog index: ")
    text.insert(END, GunningFog)
    text.insert(END, "\n")

    text.insert(END, "Smog: ")
    text.insert(END, smog)
    text.insert(END, "\n\n")
    
    text.insert(END, "Following words not included in analysis - syllable count is missing from the cmudict database:\n\n")
    for k,y in sorted(wordsnotincmu.items()):
        text.insert(END, k)
Example #10
#   2. Redistributions in binary form must reproduce the above copyright notice,
#      this list of conditions and the following disclaimer in the documentation
#      and/or other materials provided with the distribution.
#   3. The name of the author may not be used to endorse or promote products
#      derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR 
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import pickle
from nltk.corpus import cmudict
import re
syllables = dict()
numeral = re.compile(r'\d')
for (word, phonemes) in cmudict.entries():
    word = word.upper()
    # the number of stress digits (0/1/2) in the phonemes equals the number of syllables
    count = len([x for x in list(''.join(phonemes)) if x >= '0' and x <= '9'])
    if word in syllables:
        count = min(count, syllables[word])
    syllables[word] = count
output = open('cmudict.pickle2', 'wb')
pickle.dump(syllables, output, pickle.HIGHEST_PROTOCOL)
output.close()
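For completeness, a hedged sketch of reading the pickled syllable table back; the filename matches the one written above, and keys are upper-case words.
import pickle

with open('cmudict.pickle2', 'rb') as f:
    syllable_counts = pickle.load(f)
print(syllable_counts.get('LANGUAGE'))   # per-word syllable count, or None if absent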
Example #11
def cmu_multi_pronounciation():
	"""calculates the proportion of words in the cmu dictionary that has more than 1 pronounciation"""
	freq = nltk.FreqDist([entry[0] for entry in cmudict.entries()])
	multiples = freq.B() - freq.Nr(1) # total_types - 1_pronounciation
	return multiples * 1.0 / freq.B()
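freq.B() and freq.Nr() are older FreqDist conveniences; the same proportion can also be computed with the standard library alone. A minimal sketch:
from collections import Counter
from nltk.corpus import cmudict

counts = Counter(word for word, pron in cmudict.entries())
multiples = sum(1 for c in counts.values() if c > 1)   # words with two or more pronunciations
print(multiples / len(counts))                         # proportion of distinct words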
Example #12
from nltk.corpus import (conll2007, words, stopwords, names, cmudict,
                         brown, movie_reviews, reuters)

print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS
# nltk.download('cmudict')
print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()
       ])  # doctest: +NORMALIZE_WHITESPACE
# categorized corpora
print(brown.categories())  # doctest: +NORMALIZE_WHITESPACE
print(movie_reviews.categories())
# nltk.download('reuters')
print(reuters.categories())  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
print(brown.categories('ca01'))
print(brown.categories(['ca01', 'cb01']))
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))  # doctest: +ELLIPSIS
print(brown.tagged_words(categories='news'))
Example #13
from nltk.corpus import cmudict
from gensim.models import Word2Vec

dic = dict(cmudict.entries())
model = Word2Vec.load('data/rap.model')
print "here we go~~~~~~"

'''assume that keyword is in dic'''
def get_rhyme_similiarity(keyword, suggestion):
	if not suggestion in dic.keys():
		return 0
	keyword_syl = dic[keyword]
	suggestion_syl = dic[suggestion]

	overlap = len(set(keyword_syl).intersection(suggestion_syl))
	base = overlap / float(len(keyword_syl))
	pair = overlap / float(len(suggestion_syl))
	return base * pair


def suggest_rhyme(keyword):
	if keyword not in dic.keys():
		return []
	suggestions = model.most_similar(keyword, topn=1000)
	suggestion_list = []
	for word, semantic_score in suggestions:
		rhyme_score = get_rhyme_similiarity(keyword, word)
		combined_score = rhyme_score
		if combined_score != 0:
			suggestion_list += [(word, combined_score)]
	return suggestion_list
Example #14
# Jonathan Monreal

from __future__ import division
import nltk
from nltk.corpus import cmudict as cmu

entries = cmu.entries() # Stores the entries in the CMU dictionary
words = 0               # Stores the number of unique words
multiples = 0           # Stores the number of words with multiple pronunciations

for i in range(0, len(entries)):
    if entries[i-1][0] != entries[i][0]:      # if the entry is a new word
        words += 1
    elif entries[i-1][0] != entries[i-2][0]:  # else if this is the word's second pronunciation
        multiples += 1


print('Words: ' + str(words))
print('Multiples: ' + str(multiples))
print('Fraction: ' + str(multiples / words))
Example #15
from nltk.corpus import cmudict as cmu
entries = cmu.entries()
words = 0
multiples = 0

for i in range(0, len(entries)):
	if entries[i-1][0] != entries[i][0]:
		words += 1
	elif entries[i-1][0] != entries[i-2][0]:
		multiples += 1

print('Words:' + str(words))
print('multiples:' + str(multiples))
print('Fraction:' + str(multiples / words))
Example #16
from util.benchmarking import Timer
import random
import re
import nltk
import operator
from textblob import TextBlob
from pattern.en import parse
from nltk.corpus import cmudict

e = cmudict.entries()
d = cmudict.dict()

banned_end_words = ['the', 'a', 'an', 'at', 'been', 'in', 'of', 'to', 'by', 'my',
					'too', 'not', 'and', 'but', 'or', 'than', 'then', 'no', 'o',
					'for', 'so', 'which', 'their', 'on', 'your', 'as', 'has',
					'what', 'is', 'nor', 'i', 'this', 'that']

pos_map = {'NNP':'NN', 'NNPS':'NNS', 'NNP-LOC':'NN', 'NNPS-LOC':'NNS'}

## blocks = re.split('\n+', testtext)

def sylcount(s):
	try:
		d[s]
	except KeyError:
		return None
	else:
		if len(d[s]) <= 1:
			sj = ''.join(d[s][0])
			sl = re.split('0|1|2', sj)
			return len(sl) - 1
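A hedged usage sketch of sylcount() above; it returns None for words missing from cmudict (and, as written, also for words with several pronunciations).
print(sylcount('poetry'))   # syllable count if 'poetry' has a single pronunciation, else None
print(sylcount('xqzzy'))    # None: not in the CMU dictionary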
Example #17
from nltk.corpus import cmudict as cmu
import os

i = 1
for w in cmu.entries():
    print "insert into CMU.Words (ID, Word,Pronunciation) values (" + str(i) + ",'" + w[0].replace(
        "'", "''"
    ) + "','" + " ".join(w[1]) + "')"
    print "GO"

    # j = 1
    # for s in w[1]:
    #    print "insert into CMU.Syllables (ID,SyllableNumber,Pronunciation) values (" + str(i) + "," + str(j) + ",'" + s.replace("'","''") + "')"
    #    print "GO"
    #    j+=1

    i += 1
Example #18
fdist1.plot(50, cumulative=True)
#true, most words in text are stop words!!

#11 Investigate the table of modal distributions and look for other patterns. Try to explain them in terms of your own impressionistic understanding of the different genres. Can you find other closed classes of words that exhibit significant differences across different genres?
#conditional frequency distributions
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# check distribution of 5 w's 1 h
general_words = ["who", "what", "when", "where", "why", "how"]
#conditional frequency distributions with event_words
cfd.tabulate(conditions=genres, samples=general_words)
# most frequent in new is who, when;religion is who, what;hobbies is who, when,etc.

#12 The CMU Pronouncing Dictionary contains multiple pronunciations for certain words. How many distinct words does it contain? What fraction of words in this dictionary have more than one possible pronunciation?
words = [word for word, pron in cmudict.entries()]
wordset = set(words)
cmu = cmudict.dict()
print(len(words))
print(len(wordset))
more_than_one_pron = [word for word in wordset if len(cmu.get(word)) > 1]
print(
    len(more_than_one_pron) / len(wordset) * 100,
    "% words have more than one pronounciation")

#13 What percentage of noun synsets have no hyponyms? You can get all noun synsets using wn.all_synsets('n').
no_hyp_nouns = [
    noun for noun in wn.all_synsets('n') if len(noun.hyponyms()) == 0
]
all_noun_words = [noun for noun in wn.all_synsets('n')]
print("Percentage of noun having no hyponyms: ",
Example #19
def get_pron(word):
    for entry in cmu.entries():
        if entry[0] == word:
            return entry[1]
    return None
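get_pron() above walks the full entries list on every call; cmudict.dict() exposes the same data as a dictionary. A hedged equivalent sketch:
from nltk.corpus import cmudict

prons = cmudict.dict()   # word -> list of pronunciations

def get_pron_fast(word):
    # first listed pronunciation, or None if the word is unknown (mirrors get_pron above)
    entries_for_word = prons.get(word)
    return entries_for_word[0] if entries_for_word else None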
Example #20
from nltk.corpus import cmudict

entries = cmudict.entries()
pron_counts = {}

for word, pron in entries:
    pron_counts[word] = pron_counts.get(word, 0) + 1

multi = sum(1 for count in pron_counts.values() if count > 1)

# Fraction of distinct words with more than one possible pronunciation
print(1.0 * multi / len(pron_counts))
Example #21
        for r in self.rhyme_list:
            print(r)
            print(self.rhyme_list[r])

    def __init__(self, word):
        self.rhyme_list = collections.defaultdict(list)
        self.l_word = word
        self.l_pron = rhyming.get_pron(word)
        self.last_syl = rhyming.last_syllable(self.l_pron)

    def last_syllable(self):
        return self.l_pron[self.last_syl :]


test1 = LWord("bash")
test2 = LWord("ababa")
test3 = LWord("sale")
test4 = LWord("blare")
# print "Rhymes with"
count = 0
for i in cmudict.entries()[1:200]:
    count = count + 1
    print(count)
    print(i[0])
    test1.check_rhyme(i[0])
    test2.check_rhyme(i[0])
    test3.check_rhyme(i[0])
    test4.check_rhyme(i[0])

test4.print_rhyming_words()
Example #22
from nltk.corpus import cmudict
dic = dict(cmudict.entries())
dic_vocab = dic.keys()
import sys
from os import listdir
from os.path import isfile, join
import pickle


def getListOfRhymes(lines):
    list_of_rhymes = []

    current_list_of_rhymes = []
    last_stresses = set()
    for line in lines:
        word = line.split()
        if len(word) > 1 and word[-1] in dic_vocab:
            word = word[-1]
            syls = dic[word]
            stresses = set([syl for syl in syls if syl[-1] in '0123456789'])
            if not stresses.intersection(last_stresses):
                list_of_rhymes.append(current_list_of_rhymes)
                current_list_of_rhymes = []
            last_stresses = stresses
            current_list_of_rhymes += [word]
        else:
            list_of_rhymes.append(current_list_of_rhymes)
            current_list_of_rhymes = []
            last_stresses = []
    list_of_rhymes.append(current_list_of_rhymes)
    list_of_rhymes = [
print "writing this up ONLY because the cmudict API turned out to be also VERY STUPID to use"

#import nltk.corpus.cmudict  # this does not work!?!?
from nltk.corpus import cmudict
import string

phoneme_dict = dict(cmudict.entries())

def syllables_in_word(word):
    '''Attempts to count the number of syllables in the string argument 'word'.
    
    Limitation: word must be in the CMU dictionary (but that was a premise of the Exercise)
    "Algorithm": no. syllables == no. (0,1,2) digits in the dictionary entry, right??        
    '''
    
    # although listcomps may be readable, you can't insert print statements to instrument them!!
    if word in phoneme_dict:
        #return sum([ phoneme.count(str(num)) for phoneme in phoneme_dict[word] for num in range(3) ])
        return len( [ph for ph in phoneme_dict[word] if ph.strip(string.ascii_letters)] )   # more destructive; less efficient? NO! see timeit results in my comments below
    else:        
        return 0                           
    

def syllables_in_text(text):
    '''Attempts to count the number of syllables in the string argument 'text'.
    
    Limitation: any "internal punctuation" must be part of the word. (it wouldn't get "this,and" correctly)
    Lets syllables_in_word do the heavy lifting.
    '''

    # ok, so apparently str.split(delim) only works for A SINGLE CHAR delim...
Example #24
def generate():
    """Generates random lyrics using Markov Chaining."""
    print("Generating song...")
    line_count = 1
    delimiter = ':'
    parsed = FIRST_COMBO.split(delimiter)
    pos_1 = parsed[0]
    pos_2 = parsed[1]
    cmu_entries = cmudict.entries()
    to_rhyme = "boat"
    output = ""
    while line_count < 41:
        syllable_count = 0
        while syllable_count < AVG_SYL:

            addendum = " "
            current_combo = pos_1 + delimiter + pos_2

            if current_combo in PARTS_OF_SPEECH:
                this_pos = PARTS_OF_SPEECH[current_combo]
            else:
                this_pos = PARTS_OF_SPEECH[pos_1]

            random_word = this_pos.get_random_word()

            try:
                syllable_count += nsyl(random_word)[0]
            except KeyError:
                syllable_count += 1

            pos_1 = this_pos.get_random_pos()
            current_combo = pos_2 + pos_1

            if current_combo in PARTS_OF_SPEECH:
                pos_2 = PARTS_OF_SPEECH[current_combo].get_random_pos()
            else:
                pos_2 = PARTS_OF_SPEECH[pos_1].get_random_pos()

            if syllable_count >= AVG_SYL:
                addendum = "\n"
                # Try to find a rhyme if possible.
                if len(to_rhyme) > 0:
                    random_word = this_pos.find_rhyme(to_rhyme, cmu_entries)
                    to_rhyme = ""
                else:
                    if random_word.find(' ') > -1:
                        to_rhyme = random_word.split(' ')[1]
                    else:
                        to_rhyme = random_word
                if line_count % 8 == 0:
                    parsed = FIRST_COMBO.split(delimiter)
                    pos_1 = parsed[0]
                    pos_2 = parsed[1]
                    addendum = "\n\n"
            output += random_word + addendum

        buffer_val = float(100) / 41
        percentage = "{0:.0f}%".format(buffer_val * line_count)
        print percentage + " complete"
        line_count += 1

    print "\n\nCOMPLETE!! Here is your new Smash Hit:\n"
    print output
Example #25
elif len(argv) == 5:
    script, book, rhyme_scheme, poem_count, output_format = argv
    show_diagnostics = 'y'
elif len(argv) == 4:
    script, book, rhyme_scheme, poem_count = argv
    output_format = 'pt'
    show_diagnostics = 'y'
elif len(argv) == 3:
    script, book, rhyme_scheme = argv
    poem_count = '10'
    output_format = 'pt'
    show_diagnostics = 'y'
else:
    print "invalid input arguments"

e = cmudict.entries()
d = cmudict.dict()

st = SnowballStemmer("english")

banned_end_words = ['the', 'a', 'an', 'at', 'been', 'in', 'of', 'to', 'by', 'my', 'too', 'not', 
                    'and', 'but', 'or', 'than', 'then', 'no', 'o', 'for', 'so', 'which', 'their', 
                    'on', 'your', 'as', 'has', 'what', 'is', 'nor', 'i', 'that', 'am', 'be', 'and',
                    'with', 'it', 'is', 'will', 'in', 'its', 'of', 'we', 'was', 'were', 'have',
                    'you', 'do', 'had', 'whose', 'while', 'because']

banned_word_combos = [['the', 'and', 'the'], ['at', 'to'], ['around', 'about'], ['the', 'all', 'the'], ['the', 'of', 'the']]

if show_diagnostics.lower() == 'y':
    print "importing source text..."
f = open(book)
data=[["ʃ", "SH", "wish"], ["θ", "TH", "breath"], ["æ", "AE", "sat"], ["ɑ", "AA", "calm"], ["ð", "DH", "breathe"], ["ə", "AX", "southern"], ["ɛ", "EH", "net"], ["ɔ", "AO", "bow"], ["ŋ", "NG", "wing"]]
import nltk
from nltk.corpus import cmudict as cmu
table = data
import nltk
from nltk.corpus import cmudict as cmu
print("The CMU Pronouncing Dictionary contains multiple pronunciations for certain words.")
print("How many distinct words does it contain?")
words = [w[0] for w in cmu.entries()]
distinct_words = set(words)
distinct_word_count = len(set(words))
print(distinct_word_count)
#What fraction of words in this dictionary have more than one possible pronunciation?
dup_words = nltk.FreqDist(words).most_common()
dup_words_count = [d[0] for d in dup_words if d[1] > 1]
print(str(len(dup_words_count)) + ' / ' + str(distinct_word_count))
#Note that if e is an entry in the dictionary, e[0] will give you just the word.  Figure out a way to use set().
import nltk
from nltk.corpus import cmudict as cmu
#use the CMU dictionary to find the number of phones in
to_lookup = ['handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home']
''' You can look up each word in the CMU dictionary, but some may have more than
one pronunciation.  Choose the first one. '''
lookup_with_cmu = {w[0]: w[1] for w in cmu.entries() if w[0] in to_lookup}
counts = [(w, len(lookup_with_cmu[w])) for w in lookup_with_cmu]
print(counts)
Example #27
def loadCmuDict(path):

    cmudict_entries = {}
    for w, p in cmudict.entries():
        cmudict_entries[w] = p
    return cmudict_entries
Example #28
from nltk.corpus import cmudict
for w,proun in cmudict.entries():
	if len(proun) >= 10 and len(w) <= 10:
		print(w, proun)

    'v' : ['V'],
    'w' : ['W'],
    'x' : ['X'],
    'y' : ['Y'],
    'z' : ['Z']
})


app = Flask(__name__)
app.debug = True
 
def char_range(c1, c2):
    for c in range(ord(c1), ord(c2)+1):
        yield chr(c)
 
candidates = [e for e in cmudict.entries()
              if len(e[0]) > MIN_LENGTH 
              and WORD_RE.match(e[0])
              and not any(e[1][0].startswith(c) for c in SOUNDS_LIKE[e[0][0].lower()] )]
separated = {}
for l in char_range('a', 'z'):
    separated[l] = [e for e in candidates if e[0][0].lower() == l]

def get_random_dictionary():
    generated = []
    for l in char_range('a', 'z'):
        word = ("Nothing Yet :(", [])
        if len(separated[l]) > 0:
            word = choice(separated[l])
        generated.append((l, word[0], word[1]))
    return generated
def exercise12():
    words = [a for a, pronounciation in cmudict.entries()]
    distinct_words = set(words)
    percent = 1 - (len(distinct_words) / len(words))
    print("distinct words = " + str(len(distinct_words)))
    print("percentage of words with more than one pronunciation: " + str(percent*100))