#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 16 18:22:01 2018
@author: maciej
"""
from nltk.corpus import gutenberg
from nltk import Text
gutenberg.fileids()
emma=gutenberg.words('austen-emma.txt')
emma=Text(emma)
emma.concordance("surprize")
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print((num_chars/num_words, num_words/num_sents, num_words/num_vocab, fileid))
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print((fileid, webtext.raw(fileid)[:65]))
from nltk.corpus import brown
from nltk import FreqDist
brown.categories()
news_text=brown.words(categories="news")
gov=brown.words(categories='government')
fdist=FreqDist([w.lower() for w in news_text])
fdist_gov=FreqDist([w.lower() for w in gov])
modals=["can","could", "may", "might","must", "will"]
for m in modals:
    # frequency of each modal relative to 'can', in government vs. news text
    print(m + ': ' + str(fdist_gov[m]/fdist_gov[modals[0]]) + " " + str(fdist[m]/fdist[modals[0]]))
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist((genre, word) for genre in brown.categories()
                          for word in [word.lower() for word in brown.words(categories=genre)])
days=['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
cfd.tabulate(conditions=['news', 'romance'], samples=days)
sent="In the beginning God created the heaven and the earth".split(sep=" ")+["."]
from nltk import bigrams
list(bigrams(sent))
#random text generator
import random
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        # expand the frequency distribution into one entry per occurrence,
        # then draw the next word proportionally to its frequency
        candidates = list(cfdist[word].keys())
        counts = list(cfdist[word].values())
        pool = []
        for j in range(len(candidates)):
            pool = pool + [candidates[j]] * counts[j]
        word = random.choice(pool)
    print(' ')
from nltk.corpus import genesis
text=genesis.words('english-kjv.txt')
bigram=bigrams(text)
cfd=ConditionalFreqDist(bigram)
generate_model(cfd, 'living', num=25)
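# A shorter sketch of the same sampling step (generate_model_weighted is just
# an illustrative name): random.choices (Python 3.6+) draws with frequency
# weights directly, so the expanded pool list above is not needed.
def generate_model_weighted(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        successors = list(cfdist[word].keys())
        word = random.choices(successors, weights=[cfdist[word][w] for w in successors])[0]
    print(' ')
generate_model_weighted(cfd, 'living', num=25)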
import nltk
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    # equivalently: [w for w in text_vocab if w not in english_vocab]
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)
un=unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
from nltk.corpus import stopwords
stopwords.words('english')
def content_fraction(text):
    # build the stopword set once; compare lowercased tokens so that
    # capitalized stopwords ("The") are filtered out as well
    stopwords_set = set(nltk.corpus.stopwords.words('english'))
    l1 = [w.lower() for w in text if w.isalpha()]
    l2 = [w for w in l1 if w not in stopwords_set]
    return len(l2)/len(l1)
print(content_fraction(nltk.corpus.reuters.words()))
import itertools
def how_many_words(letter_list, obligatory_letter, min_length=4):
    leng = min_length
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    full = []
    while leng != 0:
        # every combination of leng-1 optional letters plus the obligatory one
        temp_let = [list(i)+[obligatory_letter] for i in itertools.combinations(letter_list, leng-1)]
        # all orderings of those letters that spell an English word
        temp_words = list(set(w for w in ("".join(j) for i in temp_let for j in itertools.permutations(i)) if w in english_vocab))
        full = full + temp_words
        # keep growing the word length until nothing new is found
        leng = leng+1 if len(temp_words) != 0 else 0
    return full
wrds=how_many_words(['e','g','i','v','v','o','n','l'], 'r', min_length=3)
names=nltk.corpus.names
names.fileids()
[w for w in names.words('male.txt') if w in names.words('female.txt')]
cfd=nltk.ConditionalFreqDist([(fileid, name[-1]) for fileid in names.fileids() for name in names.words(fileid)])
cfd.plot()
entries=nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[100:110]:
    print(entry)
for word, pron in entries:
    if len(pron) == 3:
        p1, p2, p3 = pron
        if p1 == 'P' and p3 == 'T':
            print(word, pron)
from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
swadesh.entries(['pl', 'en'])
translate=dict(swadesh.entries(['pl', 'en']))
translate['dąć']
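# the same trick works for any other language pair in swadesh; German, for
# example (assuming the 'de' fileid is present, as in the standard download):
translate.update(dict(swadesh.entries(['de', 'en'])))
translate['Hund']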
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')
from nltk.corpus import wordnet as wn
temp=wn.synsets('motorcar')
temp[0].definition()
for syn in wn.synsets('dish'):
    print(syn.hyponyms())
nltk.app.wordnet()
right=wn.synset('right_whale.n.01')
orca=wn.synset('orca.n.01')
minke=wn.synset('minke_whale.n.01')
tortoise=wn.synset('tortoise.n.01')
novel=wn.synset('novel.n.01')
right.lowest_common_hypernyms(minke)
right.lowest_common_hypernyms(orca)[0].min_depth()
right.path_similarity(minke)
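# for comparison, the more distant synsets defined above score lower:
right.path_similarity(tortoise)
right.path_similarity(novel)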
#2
from nltk.corpus import gutenberg
gutenberg.fileids()
gutenberg.words('austen-emma.txt')
# word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])
#words
len(list(set([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])))
#3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')
#4
from nltk.corpus import state_union
state_union.fileids()
words=['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd=ConditionalFreqDist([(word.lower(), fileid) for fileid in state_union.fileids() for word in state_union.words(fileid)])
cfd.plot(conditions=words)
#5
word='life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word):
    # walk every meronym/holonym relation with a single print template
    relations = [('part meronym', syn.part_meronyms()),
                 ('member meronym', syn.member_meronyms()),
                 ('substance meronym', syn.substance_meronyms()),
                 ('member holonym', syn.member_holonyms()),
                 ('part holonym', syn.part_holonyms()),
                 ('substance holonym', syn.substance_holonyms())]
    for label, related in relations:
        for mer in related:
            print("Synset '{2}':\n\t{0}\n\n{4} '{1}':\n\t{3} ".format(
                syn.definition(), mer.lemma_names()[0],
                syn.lemma_names()[0], mer.definition(), label))
#8
from nltk.corpus import names
names.fileids()
tble=[(gender, first_letter) for gender in names.fileids() for first_letter in [w[0] for w in names.words(gender)]]
cfd=nltk.ConditionalFreqDist(tble)
cfd.plot()
#12
from nltk.corpus import cmudict
words=[a for a,b in cmudict.entries()]
# number of entries that are extra pronunciations of an already-listed word
len(words) - len(set(words))
# fraction of entries that such duplicate pronunciations account for
(len(words) - len(set(words))) / len(words)
#13
from nltk.corpus import wordnet as wn
len(list(wn.all_synsets('n')))
# fraction of noun synsets with no hyponyms (leaf nodes of the hierarchy)
no_hypo=[syn for syn in wn.all_synsets('n') if not syn.hyponyms()]
len(no_hypo)/len(list(wn.all_synsets('n')))
#14
"""Define a function supergloss(s) that takes a synset s as its argument and returns
a string consisting of the concatenation of the definition of s , and the definitions
of all the hypernyms and hyponyms of s ."""
def supergloss(s):
    parts = [s.definition(), "\n\n"]
    for hyper in s.hypernyms():
        parts.append(hyper.definition())
        parts.append("\n")
    parts.append("\n")
    for hypo in s.hyponyms():
        parts.append(hypo.definition())
        parts.append("\n")
    return "".join(parts)
print(supergloss(wn.synsets("car")[0]))
#15
"""Write a program to find all words that occur at least three times in the Brown
Corpus."""
from nltk.corpus import brown
from nltk import FreqDist
lbrown=[w.lower() for w in brown.words() if w.isalpha()]
fd=FreqDist(lbrown)
frequent_words=list(set([word for word in lbrown if fd[word]>=3]))
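# an equivalent sketch (frequent_words_alt is an illustrative name) that reads
# the counts straight off the FreqDist, avoiding a second pass over the corpus:
frequent_words_alt = [w for w, c in fd.items() if c >= 3]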
#16
"""Write a program to generate a table of lexical diversity scores (i.e., token/type
ratios), as we saw in Table 1-1. Include the full set of Brown Corpus genres
( nltk.corpus.brown.categories() ). Which genre has the lowest diversity (greatest
number of tokens per type)? Is this what you would have expected?"""
from nltk.corpus import brown
print("{0}:\t{1}\t{2}\t{3}\n".format("Category", "tokens", "word_types", "lexical diversity"))
for cat in brown.categories():
    # count only alphabetic tokens so the numerator matches the type count below
    tokens = len([w for w in brown.words(categories=cat) if w.isalpha()])
    word_types = len(set(w.lower() for w in brown.words(categories=cat) if w.isalpha()))
    print("{0:15s}:\t{1:6d}\t{2:6d}\t{3:5.2f}\n".format(cat, tokens, word_types, tokens/word_types))
#17
"""Write a function that finds the 50 most frequently occurring words of a text that
are not stopwords."""
from nltk import FreqDist
from nltk.corpus import stopwords, brown
def freq_no_stop(text, lan="english"):
fd=FreqDist([w.lower() for w in text if not w.lower() in stopwords.words(lan) and w.isalpha()])
sorted_keys=sorted(list(fd.items()), key=lambda el: -el[1])
return [key for key, val in sorted_keys[:50]]
a=freq_no_stop(brown.words(categories='humor'))
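# FreqDist subclasses collections.Counter, so most_common() could replace the
# manual sort inside freq_no_stop; a minimal sketch (fd_demo is illustrative):
fd_demo = FreqDist(w.lower() for w in brown.words(categories='humor') if w.isalpha())
fd_demo.most_common(10)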
#18
"""Write a program to print the 50 most frequent bigrams (pairs of adjacent words)
of a text, omitting bigrams that contain stopwords."""
from nltk import FreqDist
from nltk import bigrams
from nltk.corpus import stopwords, brown
def most_frequent_bigrams(text, lan="english"):
bigrams_=[big for big in bigrams(text) if (not big[0] in
stopwords.words(lan) and not big[1] in stopwords.words(lan) and big[0].isalpha() and big[1].isalpha())]
fd=FreqDist(bigrams_)
sorted_keys=sorted(list(fd.items()), key=lambda el: -el[1])
print([key for key, val in sorted_keys[:50]])
most_frequent_bigrams(brown.words(categories='humor'))
#20
"""Write a function word_freq() that takes a word and the name of a section of the
Brown Corpus as arguments, and computes the frequency of the word in that sec-
tion of the corpus."""
from nltk.corpus import brown
brown.fileids()
def word_freq(word, name):
    return len([w for w in brown.words(categories=name) if w.lower() == word.lower()])
print(word_freq("was", 'humor'))
#21
"""Write a program to guess the number of syllables contained in a text, making
use of the CMU Pronouncing Dictionary."""
# I assume that the number of syllables = the number of stress digits (0s, 1s and 2s) in the pronunciation
from nltk.corpus import cmudict
def n_syllables(sentence):
    entr = cmudict.dict()
    # cmudict keys are lowercase, so lowercase the tokens before lookup
    words = [word.lower() for word in sentence.replace('.', '').replace(',', '').split() if word.isalpha()]
    # count phones carrying a stress digit; unknown words fall back to the
    # dummy string 'a', which contains no digit and so adds zero syllables
    return sum(len([phon for phon in entr.get(word, 'a')[0] if any(c.isdigit() for c in phon)]) for word in words)
print(n_syllables(" ".join(brown.words(categories='romance')[:10])))
" ".join(brown.words(categories='romance')[:10])
#22
"""Define a function hedge(text) that processes a text and produces a new version
with the word 'like' between every third word."""
#I don't filter out non-alphabetic tokens, so '.' is treated as a word
import numpy as np
def hedge(text):
    l = len(text)
    # insert 'like' after every third word
    return np.insert(np.array(text), list(range(3, l+1, 3)), 'like').tolist()
print(hedge(brown.words(categories='romance')[:100]))
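# a numpy-free sketch of the same behavior (hedge_plain is an illustrative name):
def hedge_plain(text):
    out = []
    for i, w in enumerate(text, start=1):
        out.append(w)
        if i % 3 == 0:
            out.append('like')
    return out
print(hedge_plain(brown.words(categories='romance')[:100]))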
#23
"""Zipf’s Law: Let f(w) be the frequency of a word w in free text. Suppose that all
the words of a text are ranked according to their frequency, with the most frequent
word first. Zipf’s Law states that the frequency of a word type is inversely
proportional to its rank (i.e., f × r = k, for some constant k). For example, the 50th
most common word type should occur three times as frequently as the 150th most
common word type."""
from nltk import FreqDist
from nltk.corpus import brown
from matplotlib import pyplot as plt
import numpy as np
from scipy.stats import linregress
from math import exp
import random
def check_zipf_law(text):
    fd = FreqDist(text)
    sorted_items = sorted(fd.items(), key=lambda el: -el[1])
    rank = np.arange(1, len(sorted_items)+1)
    log_rank = np.log(rank)
    log_freq = np.log(np.array([freq for word, freq in sorted_items]))
    plt.figure()
    plt.plot(log_rank, log_freq)
    # under Zipf's law f*r = k, so log f = log k - log r: slope -1, intercept log k
    slope, intercept, r_value, p_value, std_err = linregress(log_rank, log_freq)
    print("To confirm Zipf's law, the slope of the plot should be equal to -1.\n"
          "It is equal to {0:3.1f}.\nThe constant k is equal to {1:3.1f}.".format(slope, exp(intercept)))
    plt.show()
check_zipf_law(brown.words())
random_text="".join([random.choice('abcdefghijklmn ') for i in range(0,10000000)]).split(" ")
check_zipf_law(random_text)
#24
"""Modify the text generation program in Example 2-1 further, to do the following
tasks:
a. Store the n most likely words in a list words , then randomly choose a word
from the list using random.choice() . (You will need to import random first.)
b. Select a particular genre, such as a section of the Brown Corpus or a Genesis
translation, one of the Gutenberg texts, or one of the Web texts. Train the
model on this corpus and get it to generate random text. You may have to
experiment with different start words. How intelligible is the text? Discuss the
strengths and weaknesses of this method of generating random text.
c. Now train your system using two distinct genres and experiment with gener-
ating text in the hybrid genre. Discuss your observations."""
from nltk import bigrams
from nltk import ConditionalFreqDist
def generate_model(cfdist, word, num=15, n_most_likely=5):
    for i in range(num):
        print(word, end=' ')
        # keep only the n most likely successors (min, not max, caps the slice)
        sorted_items = sorted(cfdist[word].items(), key=lambda el: -el[1])[:min(n_most_likely, len(cfdist[word]))]
        l1 = [w for w, val in sorted_items]
        word = random.choice(l1)
    print(' ')
brown.categories()
bigram=bigrams([word for word in brown.words(categories=['government', 'mystery']) if word.isalpha()])
cfd=ConditionalFreqDist(bigram)
cfd["I"].items()
generate_model(cfd, 'I', num=25, n_most_likely=6)
#25
"""Define a function find_language() that takes a string as its argument and returns
a list of languages that have that string as a word. Use the udhr corpus and limit
your searches to files in the Latin-1 encoding."""
from nltk.corpus import udhr
[file for file in udhr.fileids() if 'Latin1' in file]
from nltk import ConditionalFreqDist
def find_language(string):
    text = [word for word in string.split(" ") if word.isalpha()]
    l = len(text)
    avail_langs = [file for file in udhr.fileids() if 'Latin1' in file]
    cfd = ConditionalFreqDist([(lang, word) for lang in avail_langs
                               for word in text if word in udhr.words(lang)])
    # rank languages by how many of the input tokens appear in their word list
    ls = sorted([(lang, cfd[lang]) for lang in avail_langs], key=lambda tple: tple[1].N())
    print("The most probable language of the text is {0} ({1:3.3f}% of tokens matched).".format(
        ls[-1][0].replace('-Latin1', ''), 100*ls[-1][1].N()/l))
find_language(" ".join(udhr.words('Danish_Dansk-Latin1')[:20]))
#expect wrong results here, because Polish is not in the udhr corpus
find_language('Temistokles Brodowski z wydziału komunikacji społecznej biura potwierdził dziś, że agenci z katowickiej delegatury CBA zatrzymali byłego ministra sprawiedliwości, radcę prawnego Andrzeja K., jego współpracownika z kancelarii Piotra K. oraz gdańskiego biznesmena Marka S. i byłego funkcjonariusza Wojskowych Służb Informacyjnych Jerzego K. Andrzej K. był szefem resortu sprawiedliwości w drugim rządzie Marka Belki.')
#let's try French
find_language("Solidaire du mouvement des cheminots et outré par « le manque de dialogue social », Nicolas, ingénieur informatique naviguant entre Paris et Caen, parvient lui aussi à s’organiser en recourant au télétravail. « Les jours de grève, finalement tout le monde est chez soi, on travaille par “conf call” et on ne prend pas de rendez-vous », détaille-t-il.")
#26
"""What is the branching factor of the noun hypernym hierarchy? I.e., for every
noun synset that has hyponyms—or children in the hypernym hierarchy—how
many do they have on average? You can get all noun synsets using wn.all_syn
sets('n') ."""
from nltk.corpus import wordnet
hyponyms=[len(item.hyponyms()) for item in list(wordnet.all_synsets('n')) if item.hyponyms()]
sum(hyponyms)/len(hyponyms)
#27
"""The polysemy of a word is the number of senses it has. Using WordNet, we can
determine that the noun dog has seven senses with len(wn.synsets('dog', 'n')) .
Compute the average polysemy of nouns, verbs, adjectives, and adverbs according
to WordNet."""
from nltk.corpus import wordnet
all_nouns=list(set([synset.lemma_names()[0] for synset in wordnet.all_synsets(pos=wordnet.NOUN)]))
all_verbs=list(set([synset.lemma_names()[0] for synset in wordnet.all_synsets(pos=wordnet.VERB)]))
all_adj=list(set([synset.lemma_names()[0] for synset in wordnet.all_synsets(pos=wordnet.ADJ)]))
all_adv=list(set([synset.lemma_names()[0] for synset in wordnet.all_synsets(pos=wordnet.ADV)]))
pol_noun=[len(wordnet.synsets(syn, 'n')) for syn in all_nouns]
pol_verb=[len(wordnet.synsets(syn, pos=wordnet.VERB)) for syn in all_verbs]
pol_adv=[len(wordnet.synsets(syn, pos=wordnet.ADV)) for syn in all_adv]
pol_adj=[len(wordnet.synsets(syn, wordnet.ADJ)) for syn in all_adj]
whole=[sum(pol_noun)/len(pol_noun), sum(pol_verb)/len(pol_verb),sum(pol_adv)/len(pol_adv),sum(pol_adj)/len(pol_adj)]
labels=['noun', 'verb', 'adverb', 'adjective']
for i in range(len(labels)):
    print("The average polysemy for {0} is equal to {1:4.2f}.".format(labels[i], whole[i]))
#28
"""Use one of the predefined similarity measures to score the similarity of each of
the following pairs of words. Rank the pairs in order of decreasing similarity. How
close is your ranking to the order given here, an order that was established exper-
imentally by (Miller & Charles, 1998): car-automobile, gem-jewel, journey-voyage,
boy-lad, coast-shore, asylum-madhouse, magician-wizard, midday-noon, furnace-
stove, food-fruit, bird-cock, bird-crane, tool-implement, brother-monk, lad-
brother, crane-implement, journey-car, monk-oracle, cemetery-woodland, food-
rooster, coast-hill, forest-graveyard, shore-woodland, monk-slave, coast-forest,
lad-wizard, chord-smile, glass-magician, rooster-voyage, noon-string."""
from nltk.corpus import wordnet as wn
text="car-automobile, gem-jewel, journey-voyage, boy-lad, coast-shore, asylum-madhouse, magician-wizard, midday-noon, furnace-stove, food-fruit, bird-cock, bird-crane, tool-implement, brother-monk, lad-brother, crane-implement, journey-car, monk-oracle, cemetery-woodland, food-rooster, coast-hill, forest-graveyard, shore-woodland, monk-slave, coast-forest, lad-wizard, chord-smile, glass-magician, rooster-voyage, noon-string"
text=text.replace(',', '')
text=text.split(' ')
pairs=[item.split('-') for item in text]
# score each pair by its best-matching pair of senses; path_similarity returns
# None for cross-POS comparisons, so those are skipped
similarities = []
for w1, w2 in pairs:
    sims = [s1.path_similarity(s2) for s1 in wn.synsets(w1) for s2 in wn.synsets(w2)]
    similarities.append(max(s for s in sims if s is not None))
print(sorted(zip(pairs, similarities), key=lambda x: x[1], reverse=True))
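# wup_similarity is another of WordNet's predefined measures and could be
# swapped in above; a quick spot check on the top pair:
wn.synset('car.n.01').wup_similarity(wn.synset('automobile.n.01'))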