from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus                              # the corpus before tokenization
WhitespaceTokenizer().tokenize(corpus)
print corpus                              # now carries a SUBTOKENS property
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times does the word "form" occur in the corpus?
freq_dist.count('form')

# What is the frequency of the word "form"?
freq_dist.freq('form')

# How many word tokens were counted?
freq_dist.N()

# What word types were encountered?
freq_dist.samples()

# What is the most common word?
freq_dist.max()
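For reference, freq() is simply the relative frequency: count() divided by the total token count N(). A minimal check, assuming the corpus above has already been tokenized:

# freq(w) is count(w) normalized by the total number of tokens N():
print freq_dist.count('form') / float(freq_dist.N())
print freq_dist.freq('form')              # should print the same value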
# What is the distribution of word lengths in the corpus?
freq_dist = FreqDist()
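The original listing stops at the reset above. A minimal sketch of the likely continuation, reusing the corpus tokenized earlier and counting len(token['TEXT']) instead of the token text (the loop and the two queries below are assumptions, not from the original):

for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# The most common word length, and the fraction of tokens that have it:
print freq_dist.max()
print freq_dist.freq(freq_dist.max())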
# An example of Zipf's law
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# Pair each word type's count with its index, then sort by count.
points = [(freq_dist.count(word), x)
          for x, word in enumerate(freq_dist.samples())]
points.sort()
print points
Plot(points)
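The plot is easier to read as a classic rank-frequency curve: sort the counts in decreasing order so that rank 1 is the most frequent word. Zipf's law then predicts count ~ C/rank, so the curve should fall off roughly hyperbolically. A minimal sketch of that variant (the ranking step and variable names are assumptions, not from the original):

# Sort counts in decreasing order; rank 1 = most frequent word type.
counts = [freq_dist.count(word) for word in freq_dist.samples()]
counts.sort()
counts.reverse()
zipf_points = [(rank + 1, count) for rank, count in enumerate(counts)]
Plot(zipf_points)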