Example #1
def main():
    s1 = pre(inaugural.raw('2009-Obama.txt'))
    sx = inaugural.fileids()
    for file in sx:
        s2 = pre(inaugural.raw(file))
        #inter = set(s1) & set(s2)
        similarity1 = similarity(s1, s2)
        print(similarity1, file)
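pre() and similarity() are defined elsewhere in this project and are not shown here. A minimal stand-in consistent with the commented-out set intersection (both names and bodies are assumptions, not the original code):

from nltk import word_tokenize

def pre(text):
    # lowercase, tokenize, keep alphabetic tokens only
    return [w.lower() for w in word_tokenize(text) if w.isalpha()]

def similarity(s1, s2):
    # Jaccard similarity over the two word sets
    a, b = set(s1), set(s2)
    return len(a & b) / len(a | b)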
Example #2
def g():
    # t = inaugural.raw("1789-Washington.txt")
    # print(len(t))
    sdbsearch = SimpleDBSearch()
    for url in nltk.corpus.inaugural.fileids()[:10]:
        data = inaugural.raw(url)
        print "indexing url ", url, len(data), type(data)
        sdbsearch.index(url, data)

    print "-- writing index to sdb"
    sdbsearch.writeIndexToSDB()
Example #3
def avgWord():
    x1 = []
    y1 = []
    for fileid in inaugural.fileids():
        words = inaugural.raw(fileids=fileid)
        words = words.split()
        average = sum(len(word) for word in words) / len(words)
        print(fileid[:4], "-", average)
        y1.append(fileid[:4])
        x1.append(average)

    plt.title('Average word length:')
    plt.xticks(rotation=90)
    plt.plot(y1, x1)
    plt.show()
Example #4
def getGraphs():
    index = 0
    for id in inaugural.fileids():  #prob(-14)
        index += 1
        ww = inaugural.raw(id).lower()
        num_war = ww.count('war')
        num_america = ww.count('america')
        num_economy = ww.count('economy')
        num_world = ww.count('world')
        plot(index, num_war, 'mo')  #war
        plot(index, num_america, 'go')  #america (increasing)
        plot(index, num_economy, 'ro')  #economy
        plot(index, num_world, 'bo')  #world (increasing)
        xlabel('index, purple-war, green-america, red-economy, blue-world')
        ylabel('the frequency of the words used')
    show()
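getGraphs() assumes pylab-style names (plot, xlabel, ylabel, show). If it is run as a standalone script, one way to supply them, sketched here as an assumption about the missing imports:

from matplotlib.pyplot import plot, xlabel, ylabel, show
from nltk.corpus import inaugural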
Example #5
def main():
    ##nltk.download('reuters')
    nltk.download('inaugural')
    nltk.download('punkt')
    docinaug = inaugural.fileids()
    documents = reuters.fileids()
    print(str(len(documents)))
    print(reuters.raw("test/15556"))
    forwardDict, backwardsDict, probMatrix, probUniMatrix, totalProb = tokenize(
        reuters.raw("test/15556"))

    ##print(documents[1])
    ##print(docinaug[1])
    #forwardDict,backwardDict,probMtrx=tokenize("the man. the man. the man")
    sent_token = word_tokenize("hello my friend how are you")
    print("a")
    print(sentence_perplex(inaugural.raw(docinaug[1]),probMatrix,forwardDict,probUniMatrix))
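tokenize() and sentence_perplex() are project-specific helpers that are not included in this excerpt. A rough sketch of the same idea with NLTK's nltk.lm package (the function name and smoothing choice below are assumptions, not the author's implementation):

import nltk
from nltk import word_tokenize
from nltk.corpus import reuters, inaugural
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import bigrams

def bigram_perplexity(train_text, test_text):
    # fit an add-one-smoothed bigram model on train_text, then score test_text
    train_sents = [word_tokenize(s) for s in nltk.sent_tokenize(train_text)]
    train_data, vocab = padded_everygram_pipeline(2, train_sents)
    lm = Laplace(2)
    lm.fit(train_data, vocab)
    test_bigrams = list(bigrams(pad_both_ends(word_tokenize(test_text), n=2)))
    return lm.perplexity(test_bigrams)

print(bigram_perplexity(reuters.raw("test/15556"),
                        inaugural.raw(inaugural.fileids()[1])))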
Example #6
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  #purple color
        plot(index, b, 'go')  #green color
        plot(index, c, 'ro')  #red color

        xlabel(
            'index, from Washington to Obama (purple - character/word), (red - word/vocab)'
        )
        ylabel('Average numbers (green - word/sentence)')
    show()
Example #7
def senti():
    x3 = []
    y3 = []
    x31 = []

    for fileid in inaugural.fileids():
        text = inaugural.raw(fileids=fileid)
        senti = TextBlob(text)
        print(fileid[:4], "-", senti.sentiment)
        y3.append(fileid[:4])
        x3.append(senti.sentiment[0])
        x31.append(senti.sentiment[1])
    plt.title('Polarity')
    plt.xticks(rotation=90)
    plt.plot(y3, x3)
    plt.show()
    plt.title('Subjectivity')
    plt.xticks(rotation=90)
    plt.plot(y3, x31)
    plt.show()
Example #8
    def __append_corpus_data(self):
        """
        Appends data to the questions and statements files from the inaugural address corpus
        """
        sentences = []

        # Use the Presidential inaugural addresses corpus
        for fileid in inaugural.fileids():
            raw_text = inaugural.raw(fileid)
            sentence_tokens = nltk.sent_tokenize(raw_text)
            sentences += sentence_tokens
        random.shuffle(sentences)
        random.shuffle(sentences)
        random.shuffle(sentences)

        # Write sentences to the sentences and questions files
        for sentence in sentences:
            if sentence and 10 < len(sentence) < 75:
                if sentence.endswith('?'):
                    self.q_out.write(self.__strip_sentence(sentence) + '\n')
                else:
                    self.s_out.write(self.__strip_sentence(sentence) + '\n')
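__strip_sentence() belongs to the same class but is not shown. A minimal stand-in that fits how it is used above (the behavior is an assumption):

    def __strip_sentence(self, sentence):
        # collapse internal whitespace and drop trailing end-of-sentence punctuation
        cleaned = ' '.join(sentence.split())
        return cleaned.rstrip('.?!')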
Example #9
__author__ = 'rich'
import nltk
from nltk.corpus import inaugural, stopwords

train = inaugural.raw("1789-Washington.txt")
words = train.split()
words_clean = []
for word in words:
    if word not in stopwords.words("english"):
        words_clean.append(word)

index = {}
for word in words_clean:
    if word in train:
        if word not in index.keys():
            index[word] = ['1789-Washington.txt']
        elif "1789-Washington.txt" not in index[word]:
            index[word].append("1789-Washington.txt")

print "break"
Example #10
import nltk
from nltk.corpus import inaugural

speech = inaugural.raw('1789-Washington.txt')

print(speech)



Example #11
# coding: utf-8

# In[6]:

import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer
nltk.data.path.append('F:/nltk_files/')

train_dataset = inaugural.raw('1789-Washington.txt')
test_dataset = inaugural.raw('1793-Washington.txt')

punkt_tokenizer = PunktSentenceTokenizer(train_dataset)
tokenized_text = punkt_tokenizer.tokenize(test_dataset)

def find_ner():
    try:
        for i in tokenized_text:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()

    except Exception as e:
        print(str(e))
        
find_ner()


# In[ ]:
Example #12
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural

file_content = inaugural.raw('2009-Obama.txt')

tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(set(list(tokens)))
length = len(list(tokens))
result = list()
gramslist = ngrams(tokens, 1)
dictionary = {}
for gram in gramslist:
    if str(gram) in dictionary:
        dictionary[str(gram)] += 1
    else:
        dictionary[str(gram)] = 1
print(len(dictionary))
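The manual dictionary counting above can also be done with collections.Counter; a brief equivalent sketch:

from collections import Counter
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural

tokens = word_tokenize(inaugural.raw('2009-Obama.txt'))
unigram_counts = Counter(ngrams(tokens, 1))
print(len(unigram_counts))            # number of distinct unigrams
print(unigram_counts.most_common(5))  # five most frequent unigrams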
Example #13
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural
import operator

file_content1 = inaugural.raw('2009-Obama.txt')
file_content2 = inaugural.raw('1789-Washington.txt')
tokens1 = word_tokenize(file_content1)
tokens2 = word_tokenize(file_content2)
length1 = len(list(tokens1))
length2 = len(list(tokens2))
gramslist1 = ngrams(tokens1, 1)
gramslist2 = ngrams(tokens2, 1)
dictionary1 = {}
dictionary2 = {}


def sort_dict(dictionary):
    res = sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True)
    return res


for gram in gramslist1:
    if str(gram) in dictionary1:
        dictionary1[str(gram)] += 1
    else:
        dictionary1[str(gram)] = 1
for gram in gramslist2:
    if str(gram) in dictionary2:
        dictionary2[str(gram)] += 1
    else:
        dictionary2[str(gram)] = 1
Example #14
r = requests.get("http://"+url)
website = r.text                   #Gets website content 

soup = BeautifulSoup(website,"lxml")      #Makes the website content into a beautiful soup object with a backend lxml parser
#print soup.prettify()

for script in soup(["script", "style"]):     #Rip out script and styling text
    script.extract()

clean_text = soup.get_text(" ")      #Obtains text from soup object separated by a space

lines = (line.strip() for line in clean_text.splitlines())            # break multi-headlines into a line each and join them to get rid of conjoined words
text = '\n'.join(line for line in lines)


trainText = inaugural.raw("2009-Obama.txt")                         #Training the punkt sentence tokenizer for POS tagging
custom_sent_tokenizer = PunktSentenceTokenizer(trainText)
tokenized = custom_sent_tokenizer.tokenize(text)

posContent = process_content(tokenized)         #Processes tokenized words and passes into the process_content function to tag the words

adjectives = []               #Creating an array to store the adjectives in
nouns = []

#JJ = Adjectives
#NN = Nouns
#NNS = Plural nouns
#VB = Verb

for i in posContent:
    if i[1] == "JJ":    # If the word is tagged as an adjective, store it
        adjectives.append(i[0])
Example #15
        rv = float(count) / len(dw)
    except ZeroDivisionError:
        rv = 1
    return rv


def idf(term, corpus):
    count = 0
    for doc in corpus:
        if term.lower() in doc.lower():
            count += 1
    return math.log(1 + float(len(corpus)) / count)


if __name__ == '__main__':
    q = ['fellow', 'citizens']
    corpus = []
    files = ['1789-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt']
    for file in files:
        corpus.append(inaugural.raw(file))
    corpus.append("how now brown cow")

    for file in corpus:
        tf1 = tf(q[0], file)
        tf2 = tf(q[1], file)
        print "tf: %s is %f" % (q[0], tf1)
        print "tf: %s is %f" % (q[1], tf2)
    i1 = idf(q[0], corpus)
    i2 = idf(q[1], corpus)
    print "IDF1: %s" % (i1, )
    print "IDF2: %s" % (i2, )
Example #16
# -*- coding: utf-8 -*-
# List 4-1(2): Split the text into sentences, then into words, and count them
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import inaugural
from collections import Counter
sents_Washington = nltk.tokenize.sent_tokenize(
    inaugural.raw('1789-Washington.txt'))
sents_Kennedy = nltk.tokenize.sent_tokenize(inaugural.raw('1961-Kennedy.txt'))
sents_Obama = nltk.tokenize.sent_tokenize(inaugural.raw('2009-Obama.txt'))

cnt_Washington = Counter(len(sent.split()) for sent in sents_Washington)
cnt_Kennedy = Counter(len(sent.split()) for sent in sents_Kennedy)
cnt_Obama = Counter(len(sent.split()) for sent in sents_Obama)
print(sorted(cnt_Washington.items(), key=lambda x: [x[1], x[0]], reverse=True))
print(sorted(cnt_Kennedy.items(), key=lambda x: [x[1], x[0]], reverse=True))
print(sorted(cnt_Obama.items(), key=lambda x: [x[1], x[0]], reverse=True))

nstring_Washington = np.array([len(sent.split()) for sent in sents_Washington])
nstring_Kennedy = np.array([len(sent.split()) for sent in sents_Kennedy])
nstring_Obama = np.array([len(sent.split()) for sent in sents_Obama])

plt.hist([nstring_Washington, nstring_Kennedy, nstring_Obama],
         color=['blue', 'red', 'green'],
         label=['1789 Washington', '1961 Kennedy', '2009 Obama'])
plt.title('Words per sentence in the 1789 Washington / 1961 Kennedy / 2009 Obama inaugural addresses')

plt.xlabel('Words per sentence')
plt.ylabel('Frequency')
plt.legend()
Example #17
def benchmark():
    data = inaugural.raw(nltk.corpus.inaugural.fileids())
    firsterms = " ".join(data.split()[:1000]).split()

    for term in firsterms:
        q(term.lower())
Example #18
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
'''



#### inaugural data bundled with NLTK
train_text = inaugural.raw('1789-Washington.txt')
sample_text = inaugural.raw('2009-Obama.txt')

#sentence tokenization
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
        
Example #19
import nltk
from nltk.corpus import inaugural, stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.stem import *
import pprint

# initialize pprint
pp = pprint.PrettyPrinter(indent=4)

# list of all speeches
ids = inaugural.fileids()

data = '1789-Washington.txt'

# get speech of particular file
speech = inaugural.raw(data)
speech = speech.lower()

# get sentences
sentences = inaugural.sents(data)

# sentence tokenize
sent_tokens = sent_tokenize(speech)

# print sentence
#pp.pprint(sent_tokens)

# word tokenize
word_tokens = word_tokenize(speech)

# print words
Example #20
from string import punctuation

import nltk
from nltk.corpus import stopwords, inaugural
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

#Performing different nltk preprocessing steps on your data

#Data We will use
sample = inaugural.raw("2009-Obama.txt")

#Tokenizing : Breaking down the body of text
print(sent_tokenize(sample))
#Sentence Tokenizing: Breaking down by sentence
print(word_tokenize(sample))
#Word Tokenizing:Breaking down body of text by words

#STOP WORDS: removing grammar and prepositions that add no meaning to data
stop_words = set(stopwords.words('english'))
print(stop_words)
stop_words = set(
    stopwords.words('english') + list(punctuation) + [u"'s", '""'])
print(stop_words)

#removing stop words from the corpora
allwords = []
for w in word_tokenize(sample):
    if w not in stop_words:
        allwords.append(w)
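The filtering loop above is often written as one comprehension; an equivalent sketch (this version also lower-cases tokens before the comparison, which the loop above does not):

allwords = [w for w in word_tokenize(sample) if w.lower() not in stop_words]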
Example #21
    ngrams_stats_tri={}
    ngrams_stats_bi={}
    ngrams_stats_bi_rev={}
    ngrams_stats_tri_rev={}
'''
#class

ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
vocab = Counter()
#choose sample
sample1 = brown.raw()
sample2 = gutenberg.raw()
sample3 = inaugural.raw()
sample5 = nltk.corpus.state_union.raw()
sample4 = genesis.raw('english-web.txt')
sample = sample1 + sample2 + sample3 + sample4 + sample5
vocab, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev, ngrams_stats_bi_rev = mainTrain(
    vocab, sample, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev,
    ngrams_stats_bi_rev)
'''
with open('ngrams_stats_tri.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri, hfile)
with open('ngrams_stats_bi.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_bi, hfile)
with open('ngrams_stats_tri_rev.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri_rev, hfile)
with open('ngrams_stats_bi_rev.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_bi_rev, hfile)
Example #22
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = KneserNeyInterpolated(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

#EXERCISE 3

train, vocab = padded_everygram_pipeline(2,
                                         state_union.sents('1945-Truman.txt'))
lm = MLE(2)
lm.fit(train, vocab)
print(lm.generate(100))

# Exercice 4

from neuralLG import dataset_preparation, create_model, generate_text
data = inaugural.raw()

X, Y, msl, total_words = dataset_preparation(data)
model = create_model(X, Y, msl, total_words)

text = generate_text("", 3, msl, model)
print(text)
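The imports for this excerpt are cut off. In current NLTK the pieces used above come from the modules below; neuralLG is a project-local module, not part of NLTK:

from nltk.corpus import state_union, inaugural
from nltk.lm import MLE, KneserNeyInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline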
Example #23
# Organised Named Entity Recognition
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import inaugural
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

result = []

nltktag = nltk.ne_chunk(
    nltk.pos_tag(word_tokenize(inaugural.raw("1789-Washington.txt"))))

for subtree in nltktag:
    if type(subtree) == Tree:
        result.append(
            (" ".join([Y for Y, Z in subtree.leaves()]), subtree.label()))

print(result)
Example #24
# The Brown corpus:

#Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))
print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS
# Each corpus reader provides a variety of methods to read data from the corpus, depending on the format of the corpus.

from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras(
    '1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

#

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))

print(len(inaugural.words()))

print(inaugural.readme())
Example #25
#Named Entity Recognition
#Chunking with NLTK with the help of regular expressions

#get all imports

import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer

#Create training and testing data

train_data = inaugural.raw("1789-Washington.txt")
sample_data = inaugural.raw("1793-Washington.txt")

train_tokenizer = PunktSentenceTokenizer(train_data)


def named_entity_recognition():
    try:
        print([
            nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(w)),
                          binary=False).draw()
            for w in train_tokenizer.tokenize(sample_data)
        ])

    except Exception as e:
        print(str(e))


named_entity_recognition()
Example #26
print("-------WARM UP---------")
print("------TASK 1---------")
#using inaugural fileids to list all the documents
documents = inaugural.fileids()
print(
    "Using the corpus reader class list all the documents in inaugural corpus :"
)
print(documents)
print("---------------------------------------------------------------------")
print("Find the total number of words in Clinton’s 1993 speech :")
#using the .words method to count words in Clinton's speech
clintonwords = (inaugural.words('1993-Clinton.txt'))

print(len(clintonwords))
#.raw method will read the text in raw form
s = inaugural.raw('1789-Washington.txt')
w = set(m.group(0) for m in re.finditer(r"\w+", s))
#print (len(re.findall('\w+', s)))
print("Find the total number of distinct words in the same speech :")
#now we will find length of distinct words
print(len(w))


# average function to calculate average word length
def average(numbers):
    return sum(numbers) / len(numbers)


lengths = [len(word) for word in clintonwords]
print('Find the average word type length of same speech.:')
print(average(lengths))
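The prompt asks for the average word type length, while the code above averages over all tokens. A short sketch of the type-based variant, reusing the same average() helper:

types = set(w.lower() for w in clintonwords)
print(average([len(t) for t in types]))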
Example #27
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk

nltk.download('inaugural')
nltk.download('gutenberg')
nltk.download('nps_chat')
nltk.download('webtext')
nltk.download('treebank')
from nltk.corpus import inaugural

text = inaugural.raw()
wordcloud = WordCloud(max_font_size=60).generate(text)
plt.figure(figsize=(16, 12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
from nltk.book import text4 as inaugural_speeches

plt.figure(figsize=(16, 5))
topics = ['sports', 'news', 'Government']
inaugural_speeches.dispersion_plot(topics)
from nltk.corpus import brown

stop_words = set(STOPWORDS)
topics = ['Sports - الرياضة', 'News - الاخبار', 'Government - السياسة']
for topic in topics:
    words = [
        word for word in brown.words(categories=topic)
        if word.lower() not in stop_words and word.isalpha()
Example #28
    print(gramslist)
    print('\n\n')
    print(freq.items())
    print('\n\n')
    problist = [(i, freq[i] / len(gramslist)) for i in gramslist]
    print('Probability-', problist)


file_content = open("input_text.txt").read()
tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(tokens)

get_ngram(tokens, 3)

obwords = word_tokenize(inaugural.raw('2009-Obama.txt'))
waswords = word_tokenize(inaugural.raw('1789-Washington.txt'))
print('\n\nOBAMA')
ob = FreqDist(obwords)
print('No. of words:', len(obwords))
print('No. of distinct words:', len(ob.keys()))

sortob = sorted(ob.items(), key=lambda x: x[1])
print('\n\nOBAMA50-', sortob[-50:])

was = FreqDist(waswords)
sortwas = sorted(was.items(), key=lambda x: x[1])
print('\n\nWASHINGTON50-', sortwas[-50:], '\n\n')

obuni = FreqDist(list(ngrams(obwords, 1)))
obbi = FreqDist(list(ngrams(obwords, 2)))
Example #29
from nltk.corpus import inaugural, stopwords
from nltk import word_tokenize
from string import punctuation

stop_words = set(stopwords.words('english')) | set(punctuation)
obamatext = inaugural.raw("2009-Obama.txt")
obamalist = word_tokenize(obamatext)
obamalist = [word for word in obamalist if word not in stop_words]


def getNGrams(input_list, n):
    return [
        ' '.join(input_list[i:i + n])
        for i in range(len(input_list) - (n - 1))
    ]


def getallNGram(input_list):
    allngrams = dict()
    for i in range(1, 5):
        allngrams[str(i) + "_gram"] = getNGrams(input_list, i)
    return allngrams


def MostCommon(ngrams):
    for k in ngrams.keys():
        d = dict()
        for v in ngrams[k]:
            d[v] = ngrams[k].count(v)
        print(sorted(d, key=lambda x: d[x], reverse=True)[:5], '\n')
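MostCommon() recounts every n-gram with list.count, which is quadratic; collections.Counter produces the same top-5 report directly. A sketch (it prints (ngram, count) pairs rather than bare keys):

from collections import Counter

def most_common(all_ngrams, k=5):
    for name, grams in all_ngrams.items():
        print(name, Counter(grams).most_common(k), '\n')

most_common(getallNGram(obamalist))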
Example #30
import nltk
import itertools

# importing the inaugural corpus
from nltk.corpus import inaugural

# importing stopwords
from nltk.corpus import stopwords

# importing Frequency distribution
from nltk.probability import FreqDist

words = inaugural.words('1993-Clinton.txt')
speech = inaugural.raw('1993-Clinton.txt')
stop_words = set(stopwords.words('english'))

# listing all the documents in the inaugural corpus
# print(inaugural.fileids())

# the number of words in clintons 1993 speech
print('Total number of words in the given text is:',
      len(inaugural.words('1993-Clinton.txt')))

# unique words in the speech
unique_words = sorted(set(inaugural.words('1993-Clinton.txt')))

print('Number of unique words in the given text is:', len(unique_words))


# function that returns the avg length of words in a word list.
def avg_word_length(word_list):
    return sum(len(word) for word in word_list) / len(word_list)
Example #31

def re(corpus):
    myRE1 = RAKE_tagged(100, stopwords='auto', pos=["N", "VBP", "R"])  #100
    myRE = RAKE_tagged(80, stopwords='auto', pos=["N"])  #30
    summary = myRE1.transform(corpus, output_type="s")
    summaries = ["; ".join(s) for s in summary]
    keywords = myRE.transform(summaries, output_type="w")
    CF = Concept_finder()
    CF.fit(myRE1.finaltext)
    arr = CF.transform(keywords)
    CL = Clustering(arr, CF.model)
    CL.fit_transform(20)  #50
    CL.visualize()


#load the data
#df= pd.read_csv('../input/stage2_test_text.csv', sep='\|\|', header=None, skiprows=1, names=["ID","Text"])

from nltk.corpus import inaugural

sample = [{
    'ID': fileid,
    'Text': inaugural.raw(fileid)
} for fileid in inaugural.fileids()]
df = pd.DataFrame(sample)
#df.head()

df_txt = df['Text']

re(df_txt)
Example #32
import re
import collections

from nltk.corpus import inaugural, reuters, brown, gutenberg

from itertools import product as iter_product

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts    = [a + c + b     for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
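Example #32 stops at known_edits2(). The remaining pieces of Norvig's classic corrector, which this excerpt closely follows, would look like this (a sketch; the original file may differ):

def known(words):
    return set(w for w in words if w in NWORDS)


def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)


print(correct('goverment'))   # hypothetical usage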
Example #33
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# List 4-2: Example program that splits the whole document into words and counts their frequencies
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import inaugural

sents = nltk.tokenize.sent_tokenize(inaugural.raw('1789-Washington.txt'))

cnt = Counter(len(sent.split()) for sent in sents)
print(sorted(cnt.items(), key=lambda x: [x[1], x[0]], reverse=True))

nstring = np.array([len(sent.split()) for sent in sents])
plt.hist(nstring)
plt.title('Words per sentence in the 1789 Washington inaugural address')
plt.xlabel('Words per sentence')
plt.ylabel('Frequency')
plt.show()
Example #34
import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer

train_text = inaugural.raw("1801-Jefferson.txt")
sample_text = inaugural.raw("1801-Jefferson.txt")

sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEntity = nltk.ne_chunk(tagged)
            namedEntity.draw()
    except Exception as e:
        print(str(e))