def _build_wordset(clazz, obscurity_limit):
    # I'm sorry this method is so disgusting.
    # It's all in the cause of fast loading in the main case.
    from nltk import FreqDist

    # Ensure corpora are loaded.
    try:
        from nltk.corpus import cmudict
        cmudict.entries()
    except LookupError:
        print "CMUDict corpus not found. Downloading..."
        from nltk import download
        download('cmudict')
        print "[Done]"

    if obscurity_limit is not None:
        from nltk.corpus import brown
        try:
            brown.words()
        except LookupError:
            print "Brown corpus not found. Downloading...",
            from nltk import download
            download('brown')
            print "[Done]"

    words = cmudict.entries()
    if obscurity_limit is not None:
        freqs = FreqDist([w.lower() for w in brown.words()])
        words = sorted(words,
                       key=lambda x: freqs[x[0].lower()],
                       reverse=True)
        return words[:obscurity_limit]
    else:
        return list(words)
from nltk.corpus import cmudict


def rhyme(inp, level):
    entries = cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    for (word, syllable) in syllables:
        rhymes += [word for word, pron in entries
                   if pron[-level:] == syllable[-level:]]
    return set(rhymes)
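# A minimal usage sketch for rhyme() above, assuming the cmudict corpus has been
# downloaded (nltk.download('cmudict')). `level` is the number of trailing phones
# that must match, so a larger level demands a closer rhyme.
print(sorted(rhyme('cat', 2))[:10])  # words whose pronunciation ends in AE1 T, e.g. 'hat'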
def get_rhymes(self, word):
    rhymes = []
    word_pronunciations = cmudict.dict()[word]
    for word_pronunciation in word_pronunciations:
        for rhyme, rhyme_pronunciation in cmudict.entries():
            if rhyme_pronunciation[-1] == word_pronunciation[-1]:
                rhymes.append(rhyme)
    return rhymes
def ex12():
    from nltk.corpus import cmudict
    entries = cmudict.entries()
    words = map(lambda (word, pron): word, entries)
    distinct_words = set(words)
    fd = nltk.FreqDist(words)
    multi_prons = 0
    for key in fd.keys():
        if fd[key] == 1:
            break
        multi_prons = multi_prons + 1
    print "#-distinct words:", len(distinct_words)
    print "#-words with multiple prons:", multi_prons
def produce_lexical_index(lexicon):
    """returns a lexical index"""
    index = []
    pronunciations = cmudict.entries()
    # makes an index consisting of the pronunciations and meanings.
    for word in lexicon:
        prons = find_pronunciations(word, pronunciations)
        meanings = find_meanings(word)
        index.append((word, prons, meanings))
    return index
def loadCmuDict(path):
    cmudict_entries = {}
    for word, phonemes in cmudict.entries():
        phonemes = deleteStress(phonemes)
        if not cmudict_entries.has_key(word):
            # The word is not present yet (add a new entry).
            phonemes_list = []
            phonemes_list.append(phonemes)
            cmudict_entries[word] = phonemes_list
        else:
            # The word is already registered (strong/weak forms).
            if not isDuplicated(phonemes, cmudict_entries[word]):
                cmudict_entries[word].append(phonemes)
    return cmudict_entries
def rhyme(inp, level):
    global rhyme_chunks
    if not inp.isalpha():
        return []
    inp = inp.lower()
    key = ""
    entries = cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == inp]
    if len(syllables) > 0:
        key = repr(syllables[0][1][-level:])
        if key in rhyme_chunks:
            print("Skipping because " + inp + " has a memoized rhyme")
            return rhyme_chunks[key]
    else:
        print("Skipping because " + inp + " has no rhymes")
        return []
    myRhymes = []
    for (word, syllable) in syllables:
        myRhymes += [word for word, pron in entries
                     if pron[-level:] == syllable[-level:]]
    if len(syllables) > 0:
        rhyme_chunks[key] = list(set(myRhymes))
    return list(set(myRhymes))
from nltk.corpus import cmudict

interested_words = ["what", "f**k", "shit"]
interested_prouns = [cmudict.dict()[word][0][-2:] for word in interested_words]
print [word for word, proun in cmudict.entries() if proun[-2:] in interested_prouns]
def ShowReadability():
    text.insert(END, "If this doesn't work, check NLTK is installed. If NLTK is installed, use nltk.download() to get cmudict and punkt sentence tokenizer. See Help for details \n\n\n")
    import nltk
    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0, END)
    rawtext = nltk.regexp_tokenize(data, pattern)
    # Use a list (not a one-shot generator) so the tokens can be iterated more than once below.
    prepcolloc = [w.lower() for w in rawtext]
    text.delete(1.0, END)

    # sentences
    sentcountshort = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(data)
    for sent in sents:
        if len(sent) < 2:
            sentcountshort = sentcountshort + 1
    numsents = len(sents) - sentcountshort
    numwords = len(p.split(data)) - 1
    sentcountshort = 0
    text.insert(END, "\nIgnoring one word sentences (like numbering), there are ")
    text.insert(END, numsents)
    text.insert(END, " sentences with an average of ")
    averagewordspersentence = numwords / numsents
    text.insert(END, averagewordspersentence)
    text.insert(END, " words per sentence.\n\n")

    # set up syllable dictionary
    from math import sqrt as squareroot
    from nltk.corpus import cmudict
    syllables = dict()
    numeral = re.compile(r'\d')
    for (word, phonemes) in cmudict.entries():
        word = word.lower()
        count = len([x for x in list(''.join(phonemes)) if x >= '0' and x <= '9'])
        if syllables.has_key(word):
            count = min(count, syllables[word])
        syllables[word] = count

    # count syllables
    numsyllables = 0
    wordsnotincmu = dict()
    for word in prepcolloc:
        if word in syllables:
            numsyllables = numsyllables + syllables[word]
        else:
            wordsnotincmu[word] = 1

    # count three-syllable (complex) words
    threesyllcount = 0
    for word in prepcolloc:
        if word in syllables and syllables[word] > 2:
            threesyllcount = threesyllcount + 1

    # calculate number of letters and numbers
    letnumcount = 0
    for word in rawtext:
        if word.isalpha():
            letnumcount = letnumcount + len(word)

    # adapted from Java at http://www.editcentral.com/gwt1/EditCentral.html
    # Flesch
    Flesch = 206.835 - (1.015 * numwords) / numsents - (84.6 * numsyllables) / numwords
    Flesch = "%.1f" % Flesch
    # Automated readability index
    ARI = (4.71 * letnumcount) / numwords + (0.5 * numwords) / numsents - 21.43
    ARI = "%.1f" % ARI
    # Flesch-Kincaid grade level
    FK = (0.39 * numwords) / numsents + (11.8 * numsyllables) / numwords - 15.59
    FK = "%.1f" % FK
    # Coleman-Liau index
    CL = (5.89 * letnumcount) / numwords - (30.0 * numsents) / numwords - 15.8
    CL = "%.1f" % CL
    # Gunning fog
    GunningFog = 0.4 * (numwords / numsents + (100.0 * threesyllcount) / numwords)
    GunningFog = "%.1f" % GunningFog
    # SMOG
    smog = squareroot(threesyllcount * 30.0 / numsents) + 3.0
    smog = "%.1f" % smog

    text.insert(END, "Flesch: ")
    text.insert(END, Flesch)
    text.insert(END, "\n")
    text.insert(END, "Automated readability index: ")
    text.insert(END, ARI)
    text.insert(END, "\n")
    text.insert(END, "Flesch-Kincaid grade level: ")
    text.insert(END, FK)
    text.insert(END, "\n")
    text.insert(END, "Coleman-Liau index: ")
    text.insert(END, CL)
    text.insert(END, "\n")
    text.insert(END, "Gunning fog index: ")
    text.insert(END, GunningFog)
    text.insert(END, "\n")
    text.insert(END, "Smog: ")
    text.insert(END, smog)
    text.insert(END, "\n\n")
    text.insert(END, "Following words not included in analysis - syllable count is missing from the cmudict database:\n\n")
    for k, y in sorted(wordsnotincmu.items()):
        text.insert(END, k)
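# The readability scores above follow the standard published formulas. As a
# minimal standalone sketch (a hypothetical helper, not part of the GUI code
# above), Flesch reading ease from raw counts looks like this:
def flesch_reading_ease(num_words, num_sents, num_syllables):
    # 206.835 - 1.015 * (words per sentence) - 84.6 * (syllables per word)
    return (206.835
            - 1.015 * (float(num_words) / num_sents)
            - 84.6 * (float(num_syllables) / num_words))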
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import cPickle as pickle
from nltk.corpus import cmudict
import re

syllables = dict()
numeral = re.compile(r'\d')
for (word, phonemes) in cmudict.entries():
    word = word.upper()
    count = len([x for x in list(''.join(phonemes)) if x >= '0' and x <= '9'])
    if syllables.has_key(word):
        count = min(count, syllables[word])
    syllables[word] = count

output = open('cmudict.pickle2', 'wb')
pickle.dump(syllables, output, pickle.HIGHEST_PROTOCOL)
output.close()
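# A companion sketch (not part of the script above) showing how the pickled
# syllable table could be read back; the 'cmudict.pickle2' file name is taken
# from the dump call above, and the keys are upper-cased words.
import cPickle as pickle

with open('cmudict.pickle2', 'rb') as f:
    syllable_counts = pickle.load(f)
print syllable_counts.get('NATURAL')  # roughly 3 syllables for a word like 'natural'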
def cmu_multi_pronunciation():
    """Calculates the proportion of words in the CMU dictionary that
    have more than one pronunciation."""
    freq = nltk.FreqDist([entry[0] for entry in cmudict.entries()])
    multiples = freq.B() - freq.Nr(1)  # total types - types with a single pronunciation
    return multiples * 1.0 / freq.B()
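# An equivalent sketch using cmudict.dict() instead of a FreqDist (same cmudict
# import assumed); included only to make the counting explicit. The exact ratio
# depends on the installed cmudict version.
def multi_pron_fraction():
    prons = cmudict.dict()  # word -> list of pronunciations
    multi = sum(1 for w in prons if len(prons[w]) > 1)
    return multi / float(len(prons))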
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP

# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS
# nltk.download('cmudict')
print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()])  # doctest: +NORMALIZE_WHITESPACE

# categorized corpora
print(brown.categories())  # doctest: +NORMALIZE_WHITESPACE
print(movie_reviews.categories())
# nltk.download('reuters')
print(reuters.categories())  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
print(brown.categories('ca01'))
print(brown.categories(['ca01', 'cb01']))
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))  # doctest: +ELLIPSIS
print(brown.tagged_words(categories='news'))
from nltk.corpus import cmudict
from gensim.models import Word2Vec

dic = dict(cmudict.entries())
model = Word2Vec.load('data/rap.model')
print "here we go~~~~~~"


'''assume that keyword is in dic'''
def get_rhyme_similarity(keyword, suggestion):
    if not suggestion in dic.keys():
        return 0
    keyword_syl = dic[keyword]
    suggestion_syl = dic[suggestion]
    overlap = len(set(keyword_syl).intersection(suggestion_syl))
    base = overlap / float(len(keyword_syl))
    pair = overlap / float(len(suggestion_syl))
    return base * pair


def suggest_rhyme(keyword):
    if keyword not in dic.keys():
        return []
    suggestions = model.most_similar(keyword, topn=1000)
    suggestion_list = []
    for word, semantic_score in suggestions:
        rhyme_score = get_rhyme_similarity(keyword, word)
        combined_score = rhyme_score
        if combined_score != 0:
            suggestion_list += [(word, combined_score)]
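# Quick sanity check for get_rhyme_similarity() above; only the cmudict entries
# already loaded into `dic` are needed for this part (no Word2Vec model).
print get_rhyme_similarity('cat', 'hat')  # shared phones -> nonzero score
print get_rhyme_similarity('cat', 'dog')  # no shared phones -> 0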
# Jonathan Monreal

from __future__ import division

import nltk
from nltk.corpus import cmudict as cmu

entries = cmu.entries()  # Stores the entries in the CMU dictionary

words = 0      # Stores the number of unique words
multiples = 0  # Stores the number of words with multiple pronunciations

for i in range(0, len(entries)):
    if entries[i][0] != entries[i - 1][0]:          # if the entry is a new word
        words += 1
    elif entries[i - 1][0] != entries[i - 2][0]:    # else if the word has a second pronunciation
        multiples += 1

print 'Words: ' + str(words)
print 'Multiples: ' + str(multiples)
print 'Fraction: ' + str(multiples / words)
from nltk.corpus import cmudict as cmu

entries = cmu.entries()
words = 0
multiples = 0

for i in range(0, len(entries)):
    if entries[i][0] != entries[i - 1][0]:
        words += 1
    elif entries[i - 1][0] != entries[i - 2][0]:
        multiples += 1

print 'Words:' + str(words)
print 'multiples:' + str(multiples)
print 'Fraction:' + str(multiples / words)
from util.benchmarking import Timer
import random
import re
import nltk
import operator
from textblob import TextBlob
from pattern.en import parse
from nltk.corpus import cmudict

e = cmudict.entries()
d = cmudict.dict()

banned_end_words = ['the', 'a', 'an', 'at', 'been', 'in', 'of', 'to', 'by', 'my',
                    'too', 'not', 'and', 'but', 'or', 'than', 'then', 'no', 'o',
                    'for', 'so', 'which', 'their', 'on', 'your', 'as', 'has',
                    'what', 'is', 'nor', 'i', 'this', 'that']

pos_map = {'NNP': 'NN', 'NNPS': 'NNS', 'NNP-LOC': 'NN', 'NNPS-LOC': 'NNS'}

## blocks = re.split('\n+', testtext)


def sylcount(s):
    try:
        d[s]
    except KeyError:
        return None
    else:
        if len(d[s]) <= 1:
            sj = ''.join(d[s][0])
            sl = re.split('0|1|2', sj)
            return len(sl) - 1
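# A quick sketch exercising sylcount() above (cmudict corpus assumed installed).
# The count comes from the stress digits (0/1/2) that CMUdict attaches to vowel
# phones; words listed with several pronunciations fall through to the truncated
# branch and currently come back as None.
print(sylcount('cat'))     # roughly 1
print(sylcount('banana'))  # roughly 3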
from nltk.corpus import cmudict as cmu
import os

i = 1
for w in cmu.entries():
    print "insert into CMU.Words (ID, Word,Pronunciation) values (" + str(i) + ",'" + w[0].replace("'", "''") + "','" + " ".join(w[1]) + "')"
    print "GO"
    # j = 1
    # for s in w[1]:
    #     print "insert into CMU.Syllables (ID,SyllableNumber,Pronunciation) values (" + str(i) + "," + str(j) + ",'" + s.replace("'","''") + "')"
    #     print "GO"
    #     j += 1
    i += 1
fdist1.plot(50, cumulative=True)  # true, most words in text are stop words!!

# 11. Investigate the table of modal distributions and look for other patterns.
# Try to explain them in terms of your own impressionistic understanding of the
# different genres. Can you find other closed classes of words that exhibit
# significant differences across different genres?

# conditional frequency distributions
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# check distribution of 5 w's 1 h
general_words = ["who", "what", "when", "where", "why", "how"]
# conditional frequency distributions with event_words
cfd.tabulate(conditions=genres, samples=general_words)
# most frequent in news is who, when; religion is who, what; hobbies is who, when, etc.

# 12. The CMU Pronouncing Dictionary contains multiple pronunciations for certain
# words. How many distinct words does it contain? What fraction of words in this
# dictionary have more than one possible pronunciation?
words = [word for word, pron in cmudict.entries()]
wordset = set(words)
cmu = cmudict.dict()
print(len(words))
print(len(wordset))
more_than_one_pron = [word for word in wordset if len(cmu.get(word)) > 1]
print(len(more_than_one_pron) / len(wordset) * 100,
      "% words have more than one pronunciation")

# 13. What percentage of noun synsets have no hyponyms? You can get all noun
# synsets using wn.all_synsets('n').
no_hyp_nouns = [noun for noun in wn.all_synsets('n') if len(noun.hyponyms()) == 0]
all_noun_words = [noun for noun in wn.all_synsets('n')]
print("Percentage of noun synsets having no hyponyms: ",
      len(no_hyp_nouns) / len(all_noun_words) * 100)
def get_pron(word):
    for entry in cmu.entries():
        if entry[0] == word:
            return entry[1]
    return None
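# An alternative lookup sketch using cmu.dict() (the same `cmu` alias is assumed,
# and get_pron_fast is a hypothetical helper name); this avoids scanning every
# entry and picks the first pronunciation when a word has more than one.
_pron_dict = cmu.dict()

def get_pron_fast(word):
    prons = _pron_dict.get(word)
    return prons[0] if prons else None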
from nltk.corpus import cmudict

words = cmudict.dict()
count = 0
for word in words:
    if len(words[word]) > 1:
        count += 1

# Fraction of words with more than one possible pronunciation
print(1.0 * count / len(words))
        for r in self.rhyme_list:
            print r
            print self.rhyme_list[r]

    def __init__(self, word):
        self.rhyme_list = collections.defaultdict(list)
        self.l_word = word
        self.l_pron = rhyming.get_pron(word)
        self.last_syl = rhyming.last_syllable(self.l_pron)

    def last_syllable(self):
        return self.l_pron[self.last_syl:]


test1 = LWord("bash")
test2 = LWord("ababa")
test3 = LWord("sale")
test4 = LWord("blare")

# print "Rhymes with"
count = 0
for i in cmudict.entries()[1:200]:
    count = count + 1
    print count
    print i[0]
    test1.check_rhyme(i[0])
    test2.check_rhyme(i[0])
    test3.check_rhyme(i[0])
    test4.check_rhyme(i[0])

test4.print_rhyming_words()
from nltk.corpus import cmudict

dic = dict(cmudict.entries())
dic_vocab = dic.keys()

import sys
from os import listdir
from os.path import isfile, join
import cPickle as pickle


def getListOfRhymes(lines):
    list_of_rhymes = []
    current_list_of_rhymes = []
    last_stresses = set()
    for line in lines:
        word = line.split()
        if len(word) > 1 and word[-1] in dic_vocab:
            word = word[-1]
            syls = dic[word]
            stresses = set([syl for syl in syls if syl[-1] in '0123456789'])
            if not stresses.intersection(last_stresses):
                list_of_rhymes.append(current_list_of_rhymes)
                current_list_of_rhymes = []
            last_stresses = stresses
            current_list_of_rhymes += [word]
        else:
            list_of_rhymes.append(current_list_of_rhymes)
            current_list_of_rhymes = []
            last_stresses = []
    list_of_rhymes.append(current_list_of_rhymes)
    list_of_rhymes = [
print "writing this up ONLY because the cmudict API turned out to be also VERY STUPID to use" #import nltk.corpus.cmudict # this does not work!?!? from nltk.corpus import cmudict import string phoneme_dict = dict(cmudict.entries()) def syllables_in_word(word): '''Attempts to count the number of syllables in the string argument 'word'. Limitation: word must be in the CMU dictionary (but that was a premise of the Exercise) "Algorithm": no. syllables == no. (0,1,2) digits in the dictionary entry, right?? ''' # although listcomps may be readable, you can't insert print statements to instrument them!! if phoneme_dict.has_key(word): #return sum([ phoneme.count(str(num)) for phoneme in phoneme_dict[word] for num in range(3) ]) return len( [ph for ph in phoneme_dict[word] if ph.strip(string.letters)] ) # more destructive; less efficient? NO! see timeit results in my comments below else: return 0 def syllables_in_text(text): '''Attempts to count the number of syllables in the string argument 'text'. Limitation: any "internal punctuation" must be part of the word. (it wouldn't get "this,and" correctly) Lets syllables_in_word do the heavy lifting. ''' # ok, so apparently str.split(delim) only works for A SINGLE CHAR delim...
def generate():
    """Generates random lyrics using Markov Chaining."""
    print "Generating song..."
    line_count = 1
    delimiter = ':'
    parsed = FIRST_COMBO.split(delimiter)
    pos_1 = parsed[0]
    pos_2 = parsed[1]
    cmu_entries = cmudict.entries()
    to_rhyme = "boat"
    output = ""
    while line_count < 41:
        syllable_count = 0
        while syllable_count < AVG_SYL:
            addendum = " "
            current_combo = pos_1 + delimiter + pos_2
            if current_combo in PARTS_OF_SPEECH:
                this_pos = PARTS_OF_SPEECH[current_combo]
            else:
                this_pos = PARTS_OF_SPEECH[pos_1]
            random_word = this_pos.get_random_word()
            try:
                syllable_count += nsyl(random_word)[0]
            except KeyError:
                syllable_count += 1
            pos_1 = this_pos.get_random_pos()
            current_combo = pos_2 + pos_1
            if current_combo in PARTS_OF_SPEECH:
                pos_2 = PARTS_OF_SPEECH[current_combo].get_random_pos()
            else:
                pos_2 = PARTS_OF_SPEECH[pos_1].get_random_pos()
            if syllable_count >= AVG_SYL:
                addendum = "\n"
                # Try to find a rhyme if possible.
                if len(to_rhyme) > 0:
                    random_word = this_pos.find_rhyme(to_rhyme, cmu_entries)
                    to_rhyme = ""
                else:
                    if random_word.find(' ') > -1:
                        to_rhyme = random_word.split(' ')[1]
                    else:
                        to_rhyme = random_word
                if line_count % 8 == 0:
                    parsed = FIRST_COMBO.split(delimiter)
                    pos_1 = parsed[0]
                    pos_2 = parsed[1]
                    addendum = "\n\n"
            output += random_word + addendum
        buffer_val = float(100) / 41
        percentage = "{0:.0f}%".format(buffer_val * line_count)
        print percentage + " complete"
        line_count += 1
    print "\n\nCOMPLETE!! Here is your new Smash Hit:\n"
    print output
elif len(argv) == 5:
    script, book, rhyme_scheme, poem_count, output_format = argv
    show_diagnostics = 'y'
elif len(argv) == 4:
    script, book, rhyme_scheme, poem_count = argv
    output_format = 'pt'
    show_diagnostics = 'y'
elif len(argv) == 3:
    script, book, rhyme_scheme = argv
    poem_count = '10'
    output_format = 'pt'
    show_diagnostics = 'y'
else:
    print "invalid input arguments"

e = cmudict.entries()
d = cmudict.dict()
st = SnowballStemmer("english")

banned_end_words = ['the', 'a', 'an', 'at', 'been', 'in', 'of', 'to', 'by', 'my',
                    'too', 'not', 'and', 'but', 'or', 'than', 'then', 'no', 'o',
                    'for', 'so', 'which', 'their', 'on', 'your', 'as', 'has',
                    'what', 'is', 'nor', 'i', 'that', 'am', 'be', 'and', 'with',
                    'it', 'is', 'will', 'in', 'its', 'of', 'we', 'was', 'were',
                    'have', 'you', 'do', 'had', 'whose', 'while', 'because']
banned_word_combos = [['the', 'and', 'the'], ['at', 'to'], ['around', 'about'],
                      ['the', 'all', 'the'], ['the', 'of', 'the']]

if show_diagnostics.lower() == 'y':
    print "importing source text..."
f = open(book)
data=[["ʃ", "SH", "wish"], ["θ", "TH", "breath"], ["æ", "AE", "sat"], ["ɑ", "AA", "calm"], ["ð", "DH", "breathe"], ["ə", "AX", "southern"], ["ɛ", "EH", "net"], ["ɔ", "AO", "bow"], ["ŋ", "NG", "wing"]] import nltk from nltk.corpus import cmudict as cmu table = data import nltk from nltk.corpus import cmudict as cmu print("The CMU Pronouncing Dictionary contains multiple pronunciations for certain words.") print("How many distinct words does it contain?") words = [w[0] for w in cmu.entries()] distinct_words = set(words) distinct_word_count = len(set(words)) print(distinct_word_count) #What fraction of words in this dictionary have more than one possible pronunciation? dup_words = nltk.FreqDist(words).most_common() dup_words_count = [d[0] for d in dup_words if d[1] > 1] return(str(len(dup_words_count)) + ' / ' + str(distinct_word_count)) #Note that if e is an entry in the dictionary, e[0] will give you just the word. Figure out a way to use set(). import nltk from nltk.corpus import cmudict as cmu #use the CMU dictionary to find the number of phones in to_lookup = ['handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home'] ''' You can look up each word in the CMU dictionary, but some may have more than one pronunciation. Choose the first one. ''' lookup_with_cmu = {w[0]: w[1] for w in cmu.entries() if w[0] in to_lookup} counts = [(w, len(lookup_with_cmu[w])) for w in lookup_with_cmu] return(counts)
def loadCmuDict(path):
    cmudict_entries = {}
    for w, p in cmudict.entries():
        cmudict_entries[w] = p
    return cmudict_entries
from nltk.corpus import cmudict

for w, proun in cmudict.entries():
    if len(proun) >= 10 and len(w) <= 10:
        print w, proun
    'v': ['V'],
    'w': ['W'],
    'x': ['X'],
    'y': ['Y'],
    'z': ['Z']
})

app = Flask(__name__)
app.debug = True


def char_range(c1, c2):
    for c in xrange(ord(c1), ord(c2) + 1):
        yield chr(c)


candidates = [e for e in cmudict.entries()
              if len(e[0]) > MIN_LENGTH
              and WORD_RE.match(e[0])
              and not any(e[1][0].startswith(c) for c in SOUNDS_LIKE[e[0][0].lower()])]

separated = {}
for l in char_range('a', 'z'):
    separated[l] = [e for e in candidates if e[0][0].lower() == l]


def get_random_dictionary():
    generated = []
    for l in char_range('a', 'z'):
        word = ("Nothing Yet :(", [])
        if len(separated[l]) > 0:
            word = choice(separated[l])
        generated.append((l, word[0], word[1]))
    return generated
def exercise12():
    words = [a for a, pronunciation in cmudict.entries()]
    distinct_words = set(words)
    percent = 1 - (len(distinct_words) / len(words))
    print("distinct words = " + str(len(distinct_words)))
    print("percentage of words with more than one pronunciation: " + str(percent * 100))