from nltk.book import text2, text4


def nltk_base_books():
    # get context of word
    text2.concordance("monstrous")
    text2.similar("monstrous")
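    # common_contexts prints context frames shared by the given words
    # (e.g. "a_pretty", illustrative), i.e. slots where either word fits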
    text2.common_contexts(["monstrous", "very"])

    # position of word - dispersion plot
    text4.dispersion_plot(
        ["citizens", "democracy", "freedom", "duties", "America"])

    # generate random text from a book
    text2.generate()

    # counting vocabulary in a text
    len(text2)  # count all words and punctuation; i.e. count tokens
    len(set(text2))  # count uniques; i.e. count vocab or word types
    sorted(set(text4))  # sorted list of vocab or word types
    len(set(text2)) / len(text2)  # measure of lexical richness of the text
    text2.count("pretty")  # specific word count
    text2.count("pretty") / len(text2)  # specific word count as % of full text

    # indexing -- equivalent to Python lists
    text2[123]
    text2.index("pretty")
    text2[0:15]

    # string ops as usual
    "the quick brown fox".split()
    " === ".join(text2[:20])
    "Hello " * 3
Example #2
import matplotlib.pyplot as plt
import nltk
from wordcloud import STOPWORDS, WordCloud

nltk.download('book')  # fetches inaugural, brown, webtext, treebank, ...
from nltk.corpus import inaugural

text = inaugural.raw()
wordcloud = WordCloud(max_font_size=60).generate(text)
plt.figure(figsize=(16, 12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
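
# (Assumed variant) WordCloud can also drop common words itself via its
# `stopwords` parameter:
#   wordcloud = WordCloud(max_font_size=60, stopwords=STOPWORDS).generate(text)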
from nltk.book import text4 as inaugural_speeches

plt.figure(figsize=(16, 5))
topics = ['sports', 'news', 'Government']
inaugural_speeches.dispersion_plot(topics)
from nltk.corpus import brown

stop_words = set(STOPWORDS)
# brown categories must exist; brown has no 'sports' category, so
# 'hobbies' (skills and hobbies) stands in for it here
topics = ['news', 'government', 'hobbies']
for topic in topics:
    words = [
        word for word in brown.words(categories=topic)
        if word.lower() not in stop_words and word.isalpha()
    ]
    freqdist = nltk.FreqDist(words)
    print(
        topic, 'more :',
        ' , '.join([word.lower() for word, count in freqdist.most_common(5)]))
    print(
        topic, 'less :', ' , '.join(
            [word.lower() for word, count in freqdist.most_common()[-5:]]))
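
# A quick illustrative check (assumed): most_common(n) yields (word, count)
# pairs in descending order of count; after the loop, freqdist holds the
# counts for the last topic processed.
print(freqdist.most_common(3))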
Example #3
from nltk.book import text1
from nltk.book import text4
from nltk.book import text6

# concordance, similar, and collocations print their output directly and
# return None, so they should not be wrapped in print()
text1.concordance("monstrous")
text1.similar("monstrous")
text1.collocations()
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])

print(text6.count("Very"))
print(text6.count('the') / float(len(text6)) * 100)
print(text4.count("bless"))
print(text4[100])
print(text4.index('the'))
print(text4[524])
print(text4.index('men'))
print(text4[0:len(text4)])
#  examine just the contexts that are shared by two or more words
text4.common_contexts(["war", "freedom"])

# Counting Vocabulary: the length of a text from start to finish,
# in terms of the words and punctuation symbols that appear. All tokens.
len(text4)
#  How many distinct words does the inaugural corpus contain?
# The vocabulary of a text is just the set of tokens that it uses.
len(set(text4))  # types
len(text4) / len(set(text4))  # each word used on average x times: richness of the text

# Location of a word in the text: how many words from the beginning does it appear?
# This positional information can be displayed using a dispersion plot.
# You need NumPy and Matplotlib.
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "war", "America", "vote"])

# Generating some random text in the various styles we have just seen.
text4.generate()

# count how often a word occurs in a text,
text4.count("democracy")
# compute what percentage of the text is taken up by a specific word
100 * text4.count('democracy') / len(text4)


# Define functions:
def lexical_diversity(text):
    return len(text) / len(set(text))
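
# Illustrative use of the helper (assumed): average number of uses per word type.
print(lexical_diversity(text4))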

Example #5
'''
Download test data
'''
import nltk as nk
from nltk.book import text1 as t1, text4 as t4

# nk.download()

print('===============find a keyword in context==================')
t1.concordance("america")

print('===============find words with similar contexts===============')
t1.similar("america")

print('=============contexts shared by several words=================')
t1.common_contexts(['in', 'of'])

print('=================lexical dispersion plot=================')
t4.dispersion_plot(['citizens', 'democracy', 'freedom', 'America'])

print('=================most frequent words================')
freList = nk.FreqDist(t1)
freList.plot(50, cumulative=False)

print('=================words longer than 15 characters===============')
v = set(t1)
long_words = list(filter(lambda x: len(x) > 15, v))[:10]
print(long_words)

print('=================common bigram pairs===============')
pairs = nk.bigrams(['all', 'in', 'of', 'take', 'like'])
for x in pairs:
    print(x)

t4.similar("vote")
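
# (Assumed extension) rank genuine bigrams drawn from the text itself,
# rather than from a hand-picked word list:
fdist_bi = nk.FreqDist(nk.bigrams(t1))
print(fdist_bi.most_common(5))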

def percentage(count, total):
    return 100 * count / total
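
# Illustrative call (assumed), mirroring the NLTK book:
print(percentage(t4.count('a'), len(t4)))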