Example #1
0
def exercise_gutenberg():
    # Print the list of files in the Project Gutenberg corpus
    print gutenberg.fileids()

    # Pick out one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")

    # Check the length of the book
    print len(emma)

    # Load the text as an nltk.Text object
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words (vocabulary size) in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length, average occurrences per word, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
Example #2
0
File: main.py Project: kwdhd/nlp
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
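        # avg word length, avg sentence length, and avg uses per vocabulary item (lexical diversity)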
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
Example #3
0
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length, average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words/num_vocab), fileid
Example #4
0
def for_print():
    '''
    Print three statistics for each text
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #5
0
def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
Example #6
0
def generateSentence():
    corpus = random.randint(0,3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0,20)
    len(text)
    firstRun = True
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]", "‘", "“", "#"]

    # scan forward from the random start until a sentence-ending period is found
    for x in xrange(startingWord, len(text)):
        startOfWord = text[x]
        if startOfWord == ".":
            startOfWordIndex = x
            break

    for x in xrange(startOfWordIndex + 1, startOfWordIndex+lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]

        elif text[x] not in punctuation:
            tweetString = tweetString + blank + text[x]
    return tweetString
Example #7
0
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)

    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)

    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)

    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
Example #8
0
def ex2():
  from nltk.corpus import gutenberg
  ap = gutenberg.words("austen-persuasion.txt")
  word_tokens = len(ap)
  word_types = len(set([w.lower() for w in ap]))
  print "#-word tokens=", word_tokens
  print "#-word types=", word_types
Example #9
0
def ex17():
  from nltk.corpus import gutenberg
  macbeth = gutenberg.words("shakespeare-macbeth.txt")
  stopwords = set(nltk.corpus.stopwords.words())
  fd = nltk.FreqDist([w for w in macbeth if w.lower() not in stopwords
      and len(w) > 3 and w.isalpha()])
  print fd.keys()[0:50]
Example #10
0
def find_word_probability(CORPUS):
    ''' Find word occurrence probabilty from the given corpus'''
    cfd = ConditionalFreqDist()
    prev_word = None
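    # condition each word on the word that precedes it (bigram counts)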
    for word in gutenberg.words(CORPUS):
        cfd[prev_word][word] += 1
        prev_word = word
    return cfd
Example #11
0
def fun01():
    """fun01"""
    print gutenberg.fileids()
    # emma by jane austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print len(emma)
    Text(emma).concordance("surprize")
Example #12
0
def exercise2():
    print
    print "Exercise 2"
    words = gutenberg.words('austen-persuasion.txt')
    print "Number of word tokens in the text austen-persuasion.txt: %d" %len(words)
    print "Number of word-types in the text austen-persuasion.txt: %d" %len(set(words))
    print set(words)
    print
Example #13
0
def main():
    loader = WordLoader()
    loader.load_valid_words_from_aspell("en_GB")
    loader.load_valid_words_from_aspell("en_US")
    all_words = brown.words() + gutenberg.words()
    sorted_words_filename = 'sorted_words.txt'
    loader.write_sorted_words(all_words, sorted_words_filename)
    sorted_words = loader.sorted_words
    print_anagrams(sorted_words, all_words)
Example #14
0
def searchText():

    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
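    # in Text.findall() patterns, angle brackets delimit tokens: "<a> (<.*>) <man>" captures the word between "a" and "man"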
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>") 
    chat.findall(r"<l.*>{3,}") 

    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
Example #15
0
def structure():

    raw = gutenberg.raw("burgess-busterbrown.txt")
    raw[1:20]

    words = gutenberg.words("burgess-busterbrown.txt")
    words[1:20]

    sents = gutenberg.sents("burgess-busterbrown.txt")
    sents[1:20]
Example #16
0
def gutenberg():
    # local import so the corpus reader is not shadowed by this function's name
    from nltk.corpus import gutenberg

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
Example #17
0
def main():
    sample_rankings = FreqDist(gutenberg.words('austen-persuasion.txt'))
    training_set_rankings = udhr_rankings(debug=True)

    predictions = predict_language(sample_rankings, training_set_rankings, debug=True)

    print
    for language, value in predictions:
        if value != 0:
            # print '%.-32s\t%.-10s' % (language, value)
            print '{:.<32}{}'.format(language, value)
Example #18
0
def ex18():
  from nltk.corpus import gutenberg
  macbeth = gutenberg.words("shakespeare-macbeth.txt")
  stopwords = set(nltk.corpus.stopwords.words())
  bigrams = nltk.bigrams(macbeth)
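  # adjacent word pairs from Macbeth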
  print bigrams
  bigrams_wo_stopwords = filter(lambda (k, v) : k not in stopwords
    and v not in stopwords
    and k.isalpha()
    and v.isalpha(), bigrams)
  fd = nltk.FreqDist(map(lambda (k,v) : k+":"+v, bigrams_wo_stopwords))
  print map(lambda k : (k.split(":")[0], k.split(":")[1]), fd.keys())[0:50]
Example #19
0
def exercise_unusual_words():
    text = gutenberg.words("austen-sense.txt")

    # Extract the vocabulary of the text: keep only alphabetic tokens and lowercase them
    text_vocab = set(w.lower() for w in text if w.isalpha())

    # Extract the vocabulary of the dictionary word list
    english_vocab = set(w.lower() for w in words.words())

    # Find the unusual (possibly misspelled) words: in the text but not in the dictionary
    unusual_vocab = text_vocab.difference(english_vocab)

    print sorted(unusual_vocab)
Example #20
0
def searchTokenText():
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    print moby.findall(r"<a> (<.*>) <man>")

    chat = nltk.Text(nps_chat.words())
    print chat.findall(r"<.*> <.*> <bro>")

    print chat.findall(r"<l.*>{3,}")

    from nltk.corpus import brown
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
Example #21
0
def getCorrectWord(word):
    str = re.sub(re.compile('-\n'), '', word)
    words = gutenberg.words('austen-emma.txt')
    # print sorted(set([w.lower() for w in words]))
    # print len(sorted(set([w.lower() for w in words])))
    isExist = False
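    # linear scan over the Emma word list; return the token if the hyphen-cleaned word occurs in it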
    for w in words:
        # print '...' + str + '...', w
        if str == w:
            return w
        # break
    # print ' all done.'
    if isExist is False:
        return removeBreak(word)
Example #22
0
def find_phrases(regexp):
	fids = gutenberg.fileids()
	rs = []
	for fid in fids:
		txt = nltk.Text(gutenberg.words(fid))
		ts = nltk.text.TokenSearcher(txt)
		r = ts.findall(regexp)
		for x in r:
			if x[0].lower() in wrong_vbs:
				x[0] = 'looking at'
			if x[-1].lower() in wrong_vbs:
				x[-1] = 'me'
		rs.extend(r)

	return rs
Example #23
0
def nltk_test_1():
	fd = FreqDist()
	# for each token in the relevant text, increment its counter
	for word in gutenberg.words('austen-persuasion.txt'):
		fd[word.lower()] += 1
	print fd.N()	# total number of samples
	print fd.B()	# number of bins or unique samples
	# Get a list of the top 10 words sorted by frequency
	l = []
	for word in fd.keys():
		tp = (word, fd[word])
		l.append(tp)
	l.sort(key = lambda x : x[1], reverse=True)
	for itr in l[:10]:
		print itr[0], itr[1]
Example #24
0
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Example #25
0
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
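        # build one feature vector per document: the relative frequency of every vocabulary word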
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Example #26
0
	def extract_meaningful_info(self):
		text=self.text
		text2=[]
		for line in text:
			text2+=line.split()
		words_spec=[]
		words_spec=[words for words in text2 if words not in list(set(gutenberg.words()[0:3000]))]
		tech_words=[words for words in text2 if words in dict_tech]
		link_from_readme=[elt.split('/')[2:] for elt in words_spec if 'https' in elt or 'http' in elt]
		link_from_readme=filter(lambda elt:elt!=[],link_from_readme)
		associated_website=[elt[0] for elt in link_from_readme if 'github' not in elt[0].lower()]
		associated_github=[elt[1]+'/'+elt[2] for elt in link_from_readme if 'github' in elt[0].lower()]
		self.words=words_spec
		self.links=link_from_readme
		self.associated_github=list(set(associated_github))
		self.associated_website=list(set(associated_website))
		self.tools=list(set([word for word in words_spec if isupper_(word) and 'http' not in word]))
		self.summarize=text2[0:50]
		self.technical=tech_words
		return words_spec
Example #27
0
def nltk_test_3():
	# For each token, count current word given previous word.
	# Create distribution object.
	# cfd = ConditionalFreqDist()
	# for word in word_tokenize(sent):
	# 	condition = len(word)
	# 	cfd[condition][word] += 1
	# Build the distribution over word bigrams: condition each word on its predecessor.
	words = gutenberg.words('austen-persuasion.txt')
	cfd = ConditionalFreqDist((words[i], words[i + 1]) for i in range(len(words) - 1))
	# Start predicting at the given word, say 'therefore'
	word = 'therefore'
	i = 1
	print cfd.N()
	print cfd.conditions()
	# Find all words that can possibly follow the current word and choose one at random
	while i <= 20:
		print word,
		lwords = list(cfd[word])
		follower = choice(lwords)
		word = follower
		i += 1
Example #28
0
 def __init__(self, blackboard,  min_words = 4, max_words = 8):
      super(NGramExpert, self).__init__(blackboard, "NGram Expert")
      self.blackboard = blackboard
      self.min = min_words
      self.max = max_words
      self.poems = list(gutenberg.words('blake-poems.txt'))
      self.poems.extend(list(gutenberg.words('whitman-leaves.txt')))
      self.poems.extend(list(gutenberg.words('shakespeare-macbeth.txt')))
      self.poems.extend(list(gutenberg.words('shakespeare-hamlet.txt')))
      self.poems.extend(list(gutenberg.words('shakespeare-caesar.txt')))
      self.poems.extend(list(gutenberg.words('milton-paradise.txt')))
      exclude = set(string.punctuation)
      self.poems = [w.lower() for w in self.poems if w not in exclude] 
      self.poem_bigrams = nltk.bigrams(self.poems) 
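      # conditional frequency distribution over the bigrams: for each word, the distribution of the words that follow it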
      self.cfd = nltk.ConditionalFreqDist(self.poem_bigrams)
Example #29
0
def nltk_test_2():
	# Count each token in each text of the Gutenberg collection
	fd = FreqDist()
	for text in gutenberg.fileids():
		for word in gutenberg.words(text):
			fd[word.lower()] += 1
	# Initialize two empty lists which will hold our ranks and frequencies
	ranks = []
	freqs = []
	# Generate a (rank, frequency) point for each counted token and append to the respective lists
	for rank, word in enumerate(fd):
		ranks.append(rank + 1)
		freqs.append(fd[word])
	freqs.sort(reverse=True)

	# Plot rank vs frequency on a log-log plot and show the plot
	plt.loglog(ranks, freqs)
	plt.xlabel('rank(r)', fontsize = 14, fontweight = 'bold')
	plt.ylabel('frequency(f)', fontsize = 14, fontweight = 'bold')
	plt.grid(True)
	plt.show()
Example #30
0
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #31
0
# author: 'CHENG CHI FUNG'
# student_id: '12219691'
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

nltk.download('gutenberg')
nltk.download('brown')
nltk.download('wordnet')

# load the words from corpus gutenberg
words = gutenberg.words('austen-sense.txt')

# load the sentences from corpus gutenberg
sents = gutenberg.sents('austen-sense.txt')

# sents() returns a list of sentences, each of which is a list of words
# raw() returns the text as a single string of characters

# load the words from corpus romance
romance_words = brown.words(categories='romance')
# import the data from corpus hobbies and romance
hobbies_words = brown.words(categories='hobbies')


def q1():
    print('q1: {:}'.format(''))
    # 1. Print the number of word tokens
    # YOUR CODE
    print(len(words))
Example #32
0
def get_corpus(text_name):
    return gutenberg.words(text_name)
Example #33
0
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import brown

print('Printing file identifiers of Project Gutenberg for books: \n',
      gutenberg.fileids())

emma = gutenberg.words('austen-persuasion.txt')
print('\nChoosing "Persuasion" by Jane Austen and printing its length: ',
      len(emma))

# to apply concordance from main.py we need to employ such statements:
# emma = nltk.Text(gutenberg.words('austen-emma.txt'))
# print('\nConcordance on other texts, other than those from nltk.book (with the word "surprize"): \n')
# emma.concordance('surprize')

# displaying other information about each text, by looping over all the values of fileid
print('FileId | num_chars |  num_sents | num_words | num_vocab')
for fileId in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileId))
    num_sents = len(gutenberg.sents(fileId))
    num_words = len(gutenberg.words(fileId))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileId)))
    print(fileId, num_chars, num_sents, num_words, num_vocab)

print('Categories of Brown Corpus: \n', brown.categories())

# comparing genres in their usage of modal verbs
print('Comparing news-genres in their usage of modal verbs:')
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
Example #34
0
#%%
from nltk.corpus import gutenberg
from nltk.probability import *

allwords = gutenberg.words('shakespeare-hamlet.txt')
# A frequency distribution for the outcomes
''' sx iterates over the word tokens in allwords; keep only alphabetic tokens and lowercase them:
[sx.lower() for sx in allwords if sx.isalpha()] '''
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])

# get all unique values
print(fd2.B())

# get all values
print(fd2.N())

# output the front 20 values as a form of table
fd2.tabulate(20)
fd2.plot(20)
# cumulative=True plots cumulative counts
fd2.plot(20, cumulative=True)
# %%
Example #35
0
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg  # import the Gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies within a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(
    gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution over the tokens of the text
print(
    fd
)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct types over 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; number of occurrences of 'the'; a FreqDist behaves like a dict
print(fd.N())  # 98171; total number of word tokens (with repeats), not characters
print(fd.B()
      )  # 6132; number of bins or unique samples; identical tokens fall into the same bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] hapaxes: words that occur only once
# The most frequent words are mostly function words, and the rarest (hapaxes) can only be understood from context;
# neither the most frequent nor the least frequent words say much about what makes this text distinctive.
for idx, word in enumerate(fd):  # a FreqDist can be iterated with enumerate, in order of first occurrence
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(
Example #36
0
import nltk
from nltk.corpus import gutenberg
emma = gutenberg.words("austen-emma.txt")
"""
# For each file, compute the average word length, average sentence length, and average occurrences per word
for fileid in gutenberg.fileids():
    # total number of characters
    num_chars = len(gutenberg.raw(fileid))
    # total number of words
    num_words = len(gutenberg.words(fileid))
    # total number of sentences
    num_sents = len(gutenberg.sents(fileid))
    # number of distinct words (vocabulary size)
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)
"""
"""
# Count the modal verbs
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(news_text)
modals = ['can', 'could', 'may', 'might', 'will', 'must']
for m in modals:
    print('{0} : {1}'.format(m, fdist[m]))
"""
"""
# Predict the genre from modal verb counts
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
Example #37
0
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import gutenberg, genesis, inaugural,\
       nps_chat, webtext, treebank, wordnet
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'),
Example #38
0
import nltk  # Notes: See fileids in Gutenberg corpus | Corpus is a large body of text. Gutenberg collection contains 25,000 free electronic books
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words(
    'austen-emma.txt'
)  # Notes: Pick out the text Emma from Jane Austen and name it emma. Count the number of words in it.
emma = nltk.Text(nltk.corpus.gutenberg.words(
    'austen-emma.txt'))  # Notes: Use the concordance function on emma.
print(
    "Number of words in emma file in gutenberg corpus: ", len(emma)
)  # Notes: Library/Package --> Package --> Object --> Method/Function  ==  NLTK --> corpus --> gutenberg --> words
from nltk.corpus import gutenberg  # Notes: Avoid using long statements by using 'the import statement'
for fileid in gutenberg.fileids(
):  # Notes: Loop over all fileids in Gutenberg corpus and print following statistics: avg chars/word; avg words/sentence; a lexical diversity score; fileids.
    char_count = len(gutenberg.raw(fileid))
    word_count = len(gutenberg.words(fileid))
    sent_count = len(gutenberg.sents(fileid))
    vocab_count = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(char_count / word_count), round(word_count / sent_count),
          round(word_count / vocab_count), fileid)
macbeth_sentences = gutenberg.sents(
    'shakespeare-macbeth.txt'
)  # Notes: Display the longest sentence from the macbeth text and its length
print("\n112th Macbeth sentence: ", macbeth_sentences[111])
print("\nNumber of sentences in Macbeth: ", len(macbeth_sentences))
longest_length = max(len(s) for s in macbeth_sentences)
longest_sentence = [
    sentence for sentence in macbeth_sentences
    if len(sentence) == longest_length
]
print("\nLength of longest sentence in Macbeth: ", longest_length)
Example #39
0
-------------------------
main
-------------------------
'''
if __name__ == '__main__':
    scores_lst = []
    weights = [11, 33, 50, 0.04, 4]
    sig_list = []
    fileids = gutenberg.fileids()
    print('\n\nCalculating Table of Signatures...')
    print('\n{:>25} {:>12} {:>12} {:>12} {:>12} {:>12}\n'.format(
        'File Name:', 'word_len:', 'lex_div:', 'hap_rat:', 'sent_len:',
        'sent_comp:'))
    for fid in fileids:
        # compute features, make a list of features
        words = gutenberg.words(fid)
        sents = gutenberg.sents(fid)
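        # sig = (file name, word_len, lex_div, hap_rat, sent_len, sent_comp), matching the header printed above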
        sig = compute_signature(words, sents, fid)
        sig_list.append(sig)
        print(
            '{:>25} {:>12.4f} {:>12.4f} {:>12.4f} {:>12.4f} {:>12.4f}'.format(
                sig[0], sig[1], sig[2], sig[3], sig[4], sig[5]))

    write_signatures(sig_list, 'out.txt')

    n_files = int(input('Enter Number of Mystery Files: '))
    m_sig_list = []
    for f in range(n_files):
        filename = input('Enter Name of Mystery File: ')
        raw_text = read_text(filename)
        m_words = gutenberg.words(filename)