def multi_words_xpn(self):
    """Collect the best bigram and trigram collocations (ranked by PMI)
    from the English web text.

    Returns a two-element list: [best bigrams, best trigrams], each a list
    of word tuples. N-grams occurring fewer than self.FILTERING_NUM times
    are dropped, and at most self.N_GRAM_NUM entries are kept per order.
    """
    mwes = []

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    mwes.append(finder.nbest(bigram_measures.pmi, self.N_GRAM_NUM))

    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    # BUG FIX: the trigram finder must be scored with the trigram measure;
    # the original passed bigram_measures.pmi here.
    mwes.append(finder.nbest(trigram_measures.pmi, self.N_GRAM_NUM))

    return mwes
def create_example_data():
    """Return word lists from the NLTK Genesis toy corpus for six languages.

    Downloads the corpus on first use (requires Internet access); later
    calls find it in the local NLTK data directory.

    Returns:
        dict mapping language name -> list of word tokens.
    """
    import nltk
    try:
        # nltk.data.find() raises LookupError when the corpus is missing,
        # so the former bare `except:` (which hid unrelated errors) is
        # narrowed to exactly that case.
        nltk.data.find('genesis')
        from nltk.corpus import genesis as dataset
    except LookupError:
        nltk.download('genesis')
        from nltk.corpus import genesis as dataset
    corpus_words = {
        "finnish": list(dataset.words('finnish.txt')),
        "german": list(dataset.words('german.txt')),
        "portuguese": list(dataset.words('portuguese.txt')),
        "english": list(dataset.words('english-web.txt')),
        "french": list(dataset.words('french.txt')),
        "swedish": list(dataset.words('swedish.txt'))
    }
    return corpus_words
def _word_lengths(corpus):
    """Length of every token across all files of an NLTK corpus reader."""
    return [len(word) for fileid in corpus.fileids() for word in corpus.words(fileid)]


def main():
    """Dump per-token word lengths of five corpora as CSV columns.

    Writes wordlens.txt with one column per corpus (header order below);
    columns are padded with empty cells once a shorter corpus runs out of
    tokens, so every row has exactly five comma-terminated fields.
    """
    # Column order matches the header line written below.
    columns = [_word_lengths(corpus)
               for corpus in (genesis, inaugural, webtext, brown, gutenberg)]
    with open("wordlens.txt", 'w') as f:
        # NOTE(review): the original rebound sys.stdout to f and never
        # restored it; output now goes through f.write explicitly.
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(column) for column in columns)):
            for column in columns:
                if i < len(column):
                    f.write(str(column[i]) + ",")
                else:
                    f.write(",")
            f.write("\n")
# The ~100 most common English words, held in a frozenset for O(1)
# membership tests (the original kept a sorted list and linearly scanned
# it for every token).
COMMON_WORDS = frozenset([
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it",
    "for", "not", "on", "with", "he", "as", "you", "do", "at", "this",
    "but", "his", "by", "from", "they", "we", "say", "her", "she", "or",
    "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know",
    "take", "people", "into", "year", "your", "good", "some", "could",
    "them", "see", "other", "than", "then", "now", "look", "only", "come",
    "its", "over", "think", "also", "back", "after", "use", "two", "how",
    "our", "work", "first", "well", "way", "even", "new", "want",
    "because", "any", "these", "give", "day", "most", "us"])


def _common_word_freqs(corpus):
    """Per-file fraction of tokens that are common English words."""
    freqs = []
    for fileid in corpus.fileids():
        tokens = corpus.words(fileid)
        hits = sum(1 for token in tokens if token.lower() in COMMON_WORDS)
        freqs.append(float(hits) / len(tokens))
    return freqs


def main():
    """Write per-file common-word frequencies of five corpora as CSV columns.

    Output goes to common-words.txt, one column per corpus in the header
    order; shorter columns are padded with empty cells, and each frequency
    is rounded to five decimal places.
    """
    columns = [_common_word_freqs(corpus)
               for corpus in (genesis, inaugural, webtext, brown, gutenberg)]
    with open("common-words.txt", 'w') as f:
        # NOTE(review): the original rebound sys.stdout to f and never
        # restored it; output now goes through f.write explicitly.
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(column) for column in columns)):
            for column in columns:
                if i < len(column):
                    f.write(str(round(column[i], 5)) + ",")
                else:
                    f.write(",")
            f.write("\n")
# ◑ Exercise: normalize the tokenized Genesis text with the Porter stemmer,
# then with the Lancaster stemmer, and compare the two outputs word by word.
import nltk
from nltk.corpus import genesis

text = genesis.words()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# For each token, print the raw word followed by both stems.
stemmers = (("porter: ", porter), ("lancaster: ", lancaster))
for word in text:
    print(word)
    for label, stemmer in stemmers:
        print(label + stemmer.stem(word))
# -- Python 2 interactive loader for the NLTK book's Portuguese examples. --
# NOTE(review): Text, machado, genesis and mac_morpho are imported elsewhere
# in the original file; this chunk is truncated mid-statement at the final
# `psent1 = ... .split(` call, so the closing parenthesis is missing.
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')
ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')
ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')
# NOTE(review): "Sau Paulo" is presumably a typo for "Sao Paulo" (a sibling
# copy of this loader spells it "Sao") -- confirm before changing the string.
ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    # Reprint the names of the four loaded Portuguese texts.
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')

# Truncated in the source: the close of this .split( call is missing.
psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split(
from __future__ import division

def lexical_diversity(text):
    """Average occurrences per distinct token (total tokens / types)."""
    return len(text) / len(set(text))

def lexical_diversity_multiline(text):
    """Distinct-token ratio (types / total tokens) -- the reciprocal measure."""
    total = len(text)
    distinct = len(set(text))
    return distinct / total

from nltk.corpus import genesis

kjv = genesis.words('english-kjv.txt')
lexical_diversity_multiline(kjv)

# In[72]:

def plural(word):
    """Naive English pluralisation of a single noun."""
    if word.endswith('y'):
        return word[:-1] + 'ies'
    if word[-1] in 'sx' or word[-2:] in ('sh', 'ch'):
        return word + 'es'
    if word.endswith('an'):
        return word[:-2] + 'en'
    return word + 's'
# Q20: list the distinct words ranked by decreasing frequency.
words = ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd', 'b']
fd = FreqDist(words)
length = len(set(fd))
answer = [token for token, _count in fd.most_common(length)]
print(answer)
# -> ['b', 'a', 'c', 'd']

# Q21: words of Genesis absent from a tiny vocabulary.
from nltk.corpus import genesis
print(set(genesis.words()).difference(['writing', 'another', 'random', 'sentence']))
# Yes, I am able to do that.

# Q22: sort strings by their second / last character.
from operator import itemgetter
words = ['this', 'is', 'my', 'list', 'of', 'words']
sorted(words, key=itemgetter(1))   # ['of', 'this', 'list', 'words', 'is', 'my']
sorted(words, key=itemgetter(-1))  # ['of', 'this', 'is', 'words', 'list', 'my']
# itemgetter(n) builds a callable that fetches the n-th element of whatever
# indexable object it is applied to.
# -- Python 2 loader for the NLTK book's English example texts (text1-text7). --
# NOTE(review): Text, gutenberg, genesis, inaugural, nps_chat, webtext and
# treebank must be imported earlier in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name
text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name
# Genesis tokens are coerced via str() before building the Text.
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name
text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name
text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
from nltk.corpus import (gutenberg, genesis, inaugural, nps_chat, webtext, treebank, wordnet)
from nltk.text import Text

# Banner printed when the example texts are loaded interactively.
for banner_line in ("*** Introductory Examples for the NLTK Book ***",
                    "Loading text1, ..., text9 and sent1, ..., sent9",
                    "Type the name of the text or sentence to view it.",
                    "Type: 'texts()' or 'sents()' to list the materials."):
    print(banner_line)

def _announce(label, text):
    """Print a loaded text's name and hand the text back unchanged."""
    print(label, text.name)
    return text

text1 = _announce("text1:", Text(gutenberg.words('melville-moby_dick.txt')))
text2 = _announce("text2:", Text(gutenberg.words('austen-sense.txt')))
text3 = _announce("text3:", Text(genesis.words('english-kjv.txt'), name="The Book of Genesis"))
text4 = _announce("text4:", Text(inaugural.words(), name="Inaugural Address Corpus"))
text5 = _announce("text5:", Text(nps_chat.words(), name="Chat Corpus"))
text6 = _announce("text6:", Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail"))
text7 = _announce("text7:", Text(treebank.words(), name="Wall Street Journal"))
# text8 is loaded without being announced, as in the original.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
# -- Python 2 script: build the word/sentence lists for the wofkov database. --
# NOTE(review): sqlite3, os, re, brown, treebank, words_list, abc,
# movie_reviews and genesis are imported elsewhere in the original file.
conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

# Execute the schema one statement at a time (naive split on ';' -- would
# break if any statement embedded a literal semicolon).
with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)

print "Building clean words list..."
# Keep lower-cased tokens that look like words. Note re.match only anchors
# at the start of the token, so trailing punctuation still passes --
# presumably intentional; confirm before tightening the pattern.
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
# Same token filter, applied sentence by sentence; words_list has no
# sentence reader, so it is absent from this second pass.
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
# ◑ Exercise: the set of words in a text that are missing from a vocabulary,
# computable in a single expression via set.difference().
from nltk.corpus import genesis

vocabulary = ['this', 'is', 'my', 'vocabulary', 'lookee']
print(set(genesis.words()).difference(vocabulary))
from __future__ import division

def lexical_diversity(my_text_data):
    """Distinct-token ratio of a text: vocabulary size over word count."""
    n_tokens = len(my_text_data)
    n_types = len(set(my_text_data))
    return n_types / n_tokens

t = "This is a test"
lexical_diversity(t)

from nltk.corpus import genesis
lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet: synonym-set lookups.
from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()
wn.synsets('dish')

#%%
# Word hierarchy.
motorcar = wn.synset('car.n.01')
# -- Python 2 interactive loader for the NLTK book's Portuguese examples. --
# NOTE(review): Text, machado, genesis and mac_morpho are imported elsewhere
# in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words("romance/marm05.txt"), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode("latin-1")
ptext2 = Text(machado.words("romance/marm08.txt"), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode("latin-1")
ptext3 = Text(genesis.words("portuguese.txt"), name="Gênesis")
print "ptext3:", ptext3.name.decode("latin-1")
ptext4 = Text(mac_morpho.words("mu94se01.txt"), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode("latin-1")

def texts():
    # Reprint the names of the four loaded Portuguese texts.
    print "ptext1:", ptext1.name.decode("latin-1")
    print "ptext2:", ptext2.name.decode("latin-1")
    print "ptext3:", ptext3.name.decode("latin-1")
    print "ptext4:", ptext4.name.decode("latin-1")

# Example Portuguese sentences, pre-tokenized by whitespace.
psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
# ◑ Exercise: report every word of a text absent from a vocabulary; the whole
# computation is a single set.difference() call.
from nltk.corpus import genesis

unknown = set(genesis.words()).difference(
    ['this', 'is', 'my', 'vocabulary', 'lookee'])
print(unknown)
def _letter_counts(corpus):
    """FreqDist of upper-cased ASCII letters over every file of a corpus."""
    counts = FreqDist()
    for fileid in corpus.fileids():
        for word in corpus.words(fileid):
            for character in word:
                # string.ascii_letters replaces the Python-2-only and
                # locale-dependent string.letters.
                if character in string.ascii_letters:
                    counts[character.upper()] += 1
    return counts


def main():
    """Count A-Z letter frequencies in five corpora and write them to files.

    Writes one file per corpus (a header line followed by one count per
    letter, A through Z), plus a combined letter-freq.txt CSV with one
    column per corpus.
    """
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    gutenberg_letters = _letter_counts(gutenberg)
    brown_letters = _letter_counts(brown)
    web_letters = _letter_counts(webtext)
    inaugural_letters = _letter_counts(inaugural)
    genesis_letters = _letter_counts(genesis)

    # NOTE(review): the original rebound sys.stdout to each file (and never
    # restored it) so that print() landed in the file; output now goes
    # through f.write explicitly, one "<count>\n" line per letter as before.
    per_corpus = [
        ("genesis-letter-freq.txt", "GENESIS", genesis_letters),
        ("gutenberg-letter-freq.txt", "GUTENBERG", gutenberg_letters),
        ("webtext-letter-freq.txt", "WEBTEXT", web_letters),
        ("inaugural-letter-freq.txt", "INAUGURAL", inaugural_letters),
        ("brown-letter-freq.txt", "BROWN", brown_letters),
    ]
    for filename, header, counts in per_corpus:
        with open(filename, 'w') as f:
            f.write(header + "\n")
            for letter in samples:
                f.write(str(counts[letter]) + "\n")

    with open("letter-freq.txt", 'w') as f:
        ordered = [gutenberg_letters, web_letters, inaugural_letters,
                   brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for letter in samples:
            for counts in ordered:
                f.write(str(counts[letter]) + ",")
            f.write("\n")
# -- Python 2 loader for the NLTK book's English example texts (text1-text8). --
# NOTE(review): Text, gutenberg, genesis, inaugural, nps_chat, webtext and
# treebank must be imported earlier in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name
text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name
# Genesis tokens are coerced via str() before building the Text.
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name
text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name
text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
# text8 is loaded without an announcement line.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell print "*** Introductory Examples for the NLTK Book ***" print "Loading ptext1, ... and psent1, ..." print "Type the name of the text or sentence to view it." print "Type: 'texts()' or 'sents()' to list the materials." ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") print "ptext1:", ptext1.name.decode('latin-1') ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") print "ptext2:", ptext2.name.decode('latin-1') ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") print "ptext3:", ptext3.name.decode('latin-1') ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)") print "ptext4:", ptext4.name.decode('latin-1') def texts(): print "ptext1:", ptext1.name.decode('latin-1') print "ptext2:", ptext2.name.decode('latin-1') print "ptext3:", ptext3.name.decode('latin-1') print "ptext4:", ptext4.name.decode('latin-1') psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split() psent2 = "Não consultes dicionários .".split() psent3 = "No princípio, criou Deus os céus e a terra.".split() psent4 = "A Cáritas acredita que outros cubanos devem chegar ao Brasil .".split()
cdf.plot() #%% from __future__ import division def lexical_diversity(my_text_data): word_count = len(my_text_data) vocab_size = len(set(my_text_data)) diversity_score = vocab_size / word_count return diversity_score t="This is a test" lexical_diversity(t) from nltk.corpus import genesis lexical_diversity(genesis.words('english-kjv.txt')) #%% # WordNet # Let's find synonyms from nltk.corpus import wordnet as wn wn.synsets('motorcar') wn.synset('car.n.01').lemma_names() wn.synsets('dish') #%% #Word Hierarchy motorcar=wn.synset('car.n.01')
NUM_INTERVALS = 10

if __name__ == '__main__':
    args = sys.argv
    if len(args) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(args[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        exit()

    model = args[1]
    # Optional positional arguments fall back to the module-level defaults.
    defaults = (MIN_LEN, MAX_LEN, NUM_INTERVALS)
    minLen, maxLen, numIntervals = (
        int(args[i]) if len(args) > i else default
        for i, default in enumerate(defaults, start=2))

    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    # Time the interval search itself, excluding feature extraction.
    start = time.time()
    intervals = maxdiv(feat, method='gaussian_cov', mode='TS',
                       extint_min_len=minLen, extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()

    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))
    textutils.printDetectedParagraphs(text, intervals)
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

# Load the example texts first; corpus construction itself produces no
# output, so announcing them afterwards is observably identical.
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
text2 = Text(gutenberg.words('austen-sense.txt'))
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
text5 = Text(nps_chat.words(), name="Chat Corpus")
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
text7 = Text(treebank.words(), name="Wall Street Journal")

for label, loaded in zip(("text1:", "text2:", "text3:", "text4:",
                          "text5:", "text6:", "text7:"),
                         (text1, text2, text3, text4, text5, text6, text7)):
    print(label, loaded.name)

# text8 is loaded without an announcement line, as in the original.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
def text3():
    """Build the Genesis (KJV) Text, announce it, and return it."""
    result = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
    print("text3:", result.name)
    return result
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams

# Banner printed when the example texts are loaded interactively.
for banner_line in ("*** Introductory Examples for the NLTK Book ***",
                    "Loading text1, ..., text9 and sent1, ..., sent9",
                    "Type the name of the text or sentence to view it.",
                    "Type: 'texts()' or 'sents()' to list the materials."):
    print(banner_line)

def _report(label, text):
    """Print a loaded text's name and hand the text back unchanged."""
    print(label, text.name)
    return text

text1 = _report("text1:", Text(gutenberg.words("melville-moby_dick.txt")))
text2 = _report("text2:", Text(gutenberg.words("austen-sense.txt")))
text3 = _report("text3:", Text(genesis.words("english-kjv.txt"), name="The Book of Genesis"))
text4 = _report("text4:", Text(inaugural.words(), name="Inaugural Address Corpus"))
text5 = _report("text5:", Text(nps_chat.words(), name="Chat Corpus"))
text6 = _report("text6:", Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail"))
text7 = _report("text7:", Text(treebank.words(), name="Wall Street Journal"))
# text8 is loaded without being announced, as in the original.
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
# NOTE(review): this chunk starts inside a concordance-style method whose
# `def` line is outside the visible source (self, key, wc and width belong
# to that enclosing method), and it ends truncated inside sem_index().
# The indentation below is a best-effort reconstruction.
        for i in self._index[key]:
            # wc words of context each side, clipped/padded to `width` chars;
            # left context is right-aligned, right context left-aligned.
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        # Lower-cased stem of `word` using this instance's stemmer.
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = genesis.words()

def sem_index(text):
    # Collect, per word, the WordNet offsets of all its synsets.
    word_with_syns = []
    # iterate over every word in the text
    for word in text:
        # all the synsets this word participates in
        synsets = wn.synsets(word)
        syns_indices = []
        for synset in synsets:
            # a synset's offset serves as its stable numeric index
            sem_index_num = synset.offset()
            syns_indices += [sem_index_num]
        if syns_indices:
# NOTE(review): the source is truncated here, mid-function.