def main():
    s1 = pre(inaugural.raw('2009-Obama.txt'))
    sx = inaugural.fileids()
    for file in sx:
        s2 = pre(inaugural.raw(file))
        # inter = set(s1) & set(s2)
        similarity1 = similarity(s1, s2)
        print(similarity1, file)
def g():
    # t = inaugural.raw("1789-Washington.txt")
    # print(len(t))
    sdbsearch = SimpleDBSearch()
    for url in nltk.corpus.inaugural.fileids()[:10]:
        data = inaugural.raw(url)
        print("indexing url ", url, len(data), type(data))
        sdbsearch.index(url, data)
    print("-- writing index to sdb")
    sdbsearch.writeIndexToSDB()
def avgWord():
    x1 = []
    y1 = []
    for fileid in inaugural.fileids():
        words = inaugural.raw(fileids=fileid)
        words = words.split()
        average = sum(len(word) for word in words) / len(words)
        print(fileid[:4], "-", average)
        y1.append(fileid[:4])
        x1.append(average)
    plt.title('Average word length:')
    plt.xticks(rotation=90)
    plt.plot(y1, x1)
    plt.show()
def getGraphs():
    index = 0
    for id in inaugural.fileids():
        # prob(-14)
        index += 1
        ww = inaugural.raw(id).lower()
        num_war = ww.count('war')
        num_america = ww.count('america')
        num_economy = ww.count('economy')
        num_world = ww.count('world')
        plot(index, num_war, 'mo')      # war
        plot(index, num_america, 'go')  # america (increasing)
        plot(index, num_economy, 'ro')  # economy
        plot(index, num_world, 'bo')    # world (increasing)
    xlabel('index, purple-war, green-america, red-economy, blue-world')
    ylabel('the frequency of the words used')
    show()
def main():
    ## nltk.download('reuters')
    nltk.download('inaugural')
    nltk.download('punkt')
    docinaug = inaugural.fileids()
    documents = reuters.fileids()
    print(str(len(documents)))
    print(reuters.raw("test/15556"))
    forwardDict, backwardsDict, probMatrix, probUniMatrix, totalProb = tokenize(
        reuters.raw("test/15556"))
    ## print(documents[1])
    ## print(docinaug[1])
    # forwardDict, backwardDict, probMtrx = tokenize("the man. the man. the man")
    sent_token = word_tokenize("hello my friend how are you")
    print("a")
    print(sentence_perplex(inaugural.raw(docinaug[1]), probMatrix, forwardDict, probUniMatrix))
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  # purple color
        plot(index, b, 'go')  # green color
        plot(index, c, 'ro')  # red color
    xlabel('index, from Washington to Obama (purple - character/word), (red - word/vocab)')
    ylabel('Average numbers (green - word/sentence)')
    show()
def senti():
    x3 = []
    y3 = []
    x31 = []
    for fileid in inaugural.fileids():
        text = inaugural.raw(fileids=fileid)
        senti = TextBlob(text)
        print(fileid[:4], "-", senti.sentiment)
        y3.append(fileid[:4])
        x3.append(senti.sentiment[0])
        x31.append(senti.sentiment[1])
    plt.title('Polarity')
    plt.xticks(rotation=90)
    plt.plot(y3, x3)
    plt.show()
    plt.title('Subjectivity')
    plt.xticks(rotation=90)
    plt.plot(y3, x31)
    plt.show()
def __append_corpus_data(self):
    """
    Appends data to the questions and statements files from the
    inaugural address corpus
    """
    sentences = []
    # Use the Presidential inaugural addresses corpus
    for fileid in inaugural.fileids():
        raw_text = inaugural.raw(fileid)
        sentence_tokens = nltk.sent_tokenize(raw_text)
        sentences += sentence_tokens
    random.shuffle(sentences)
    random.shuffle(sentences)
    random.shuffle(sentences)
    # Write sentences to the statements and questions files
    for sentence in sentences:
        if sentence and 10 < len(sentence) < 75:
            if sentence.endswith('?'):
                self.q_out.write(self.__strip_sentence(sentence) + '\n')
            else:
                self.s_out.write(self.__strip_sentence(sentence) + '\n')
__author__ = 'rich'

import nltk
from nltk.corpus import inaugural, stopwords

train = inaugural.raw("1789-Washington.txt")
words = train.split()
words_clean = []
for word in words:
    if word not in stopwords.words("english"):
        words_clean.append(word)

index = {}
for word in words_clean:
    if word in train:
        if word not in index.keys():
            index[word] = ['1789-Washington.txt']
        elif "1789-Washington.txt" not in index[word]:
            index[word].append("1789-Washington.txt")

print("break")
import nltk
from nltk.corpus import inaugural

speech = inaugural.raw('1789-Washington.txt')
print(speech)
# coding: utf-8

import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer

nltk.data.path.append('F:/nltk_files/')

train_dataset = inaugural.raw('1789-Washington.txt')
test_dataset = inaugural.raw('1793-Washington.txt')

punkt_tokenizer = PunktSentenceTokenizer(train_dataset)
tokenized_text = punkt_tokenizer.tokenize(test_dataset)


def find_ner():
    try:
        for i in tokenized_text:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()
    except Exception as e:
        print(str(e))


find_ner()
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural

file_content = inaugural.raw('2009-Obama.txt')
tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(set(list(tokens)))
length = len(list(tokens))
result = list()
gramslist = ngrams(tokens, 1)
dictionary = {}
for gram in gramslist:
    if str(gram) in dictionary:
        dictionary[str(gram)] += 1
    else:
        dictionary[str(gram)] = 1
print(len(dictionary))
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural
import operator

file_content1 = inaugural.raw('2009-Obama.txt')
file_content2 = inaugural.raw('1789-Washington.txt')
tokens1 = word_tokenize(file_content1)
tokens2 = word_tokenize(file_content2)
length1 = len(list(tokens1))
length2 = len(list(tokens2))
gramslist1 = ngrams(tokens1, 1)
gramslist2 = ngrams(tokens2, 1)
dictionary1 = {}
dictionary2 = {}


def sort_dict(dictionary):
    res = sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True)
    return res


for gram in gramslist1:
    if str(gram) in dictionary1:
        dictionary1[str(gram)] += 1
    else:
        dictionary1[str(gram)] = 1
for gram in gramslist2:
    if str(gram) in dictionary2:
        dictionary2[str(gram)] += 1
    else:
        dictionary2[str(gram)] = 1
r = requests.get("http://" + url)
website = r.text  # Gets website content
soup = BeautifulSoup(website, "lxml")  # Makes the website content into a BeautifulSoup object with an lxml backend parser
# print(soup.prettify())
for script in soup(["script", "style"]):  # Rip out script and styling text
    script.extract()
clean_text = soup.get_text(" ")  # Obtains text from soup object separated by a space
# break multi-headlines into a line each and join them to get rid of conjoined words
lines = (line.strip() for line in clean_text.splitlines())
text = '\n'.join(line for line in lines)

trainText = inaugural.raw("2009-Obama.txt")  # Training the Punkt sentence tokenizer for POS tagging
custom_sent_tokenizer = PunktSentenceTokenizer(trainText)
tokenized = custom_sent_tokenizer.tokenize(text)
posContent = process_content(tokenized)  # Processes tokenized sentences and passes them into process_content to tag the words

adjectives = []  # Creating an array to store the adjectives in
nouns = []
# JJ = Adjectives
# NN = Nouns
# NNS = Plural nouns
# VB = Verb
for i in posContent:
    if i[1] == "JJ":  # If the word is tagged as an adjective
        rv = float(count) / len(dw)
    except ZeroDivisionError:
        rv = 1
    return rv


def idf(term, corpus):
    count = 0
    for doc in corpus:
        if term.lower() in doc.lower():
            count += 1
    return math.log(1 + float(len(corpus)) / count)


if __name__ == '__main__':
    q = ['fellow', 'citizens']
    corpus = []
    files = ['1789-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt']
    for file in files:
        corpus.append(inaugural.raw(file))
    corpus.append("how now brown cow")
    for file in corpus:
        tf1 = tf(q[0], file)
        tf2 = tf(q[1], file)
        print("tf: %s is %f" % (q[0], tf1))
        print("tf: %s is %f" % (q[1], tf2))
    i1 = idf(q[0], corpus)
    i2 = idf(q[1], corpus)
    print("IDF1: %s" % (i1,))
    print("IDF2: %s" % (i2,))
# -*- coding: utf-8 -*-
# Listing 4-1 (2): Splitting the text into sentences, then into words, and counting them
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import inaugural
from collections import Counter

sents_Washington = nltk.tokenize.sent_tokenize(inaugural.raw('1789-Washington.txt'))
sents_Kennedy = nltk.tokenize.sent_tokenize(inaugural.raw('1961-Kennedy.txt'))
sents_Obama = nltk.tokenize.sent_tokenize(inaugural.raw('2009-Obama.txt'))

cnt_Washington = Counter(len(sent.split()) for sent in sents_Washington)
cnt_Kennedy = Counter(len(sent.split()) for sent in sents_Kennedy)
cnt_Obama = Counter(len(sent.split()) for sent in sents_Obama)

print(sorted(cnt_Washington.items(), key=lambda x: [x[1], x[0]], reverse=True))
print(sorted(cnt_Kennedy.items(), key=lambda x: [x[1], x[0]], reverse=True))
print(sorted(cnt_Obama.items(), key=lambda x: [x[1], x[0]], reverse=True))

nstring_Washington = np.array([len(sent.split()) for sent in sents_Washington])
nstring_Kennedy = np.array([len(sent.split()) for sent in sents_Kennedy])
nstring_Obama = np.array([len(sent.split()) for sent in sents_Obama])

plt.hist([nstring_Washington, nstring_Kennedy, nstring_Obama],
         color=['blue', 'red', 'green'],
         label=['Washington 1789', 'Kennedy 1961', 'Obama 2009'])
plt.title('Words per sentence in the 1789 Washington / 1961 Kennedy / 2009 Obama inaugural addresses')
plt.xlabel('Words per sentence')
plt.ylabel('Frequency')
plt.legend()
def benchmark():
    data = inaugural.raw(nltk.corpus.inaugural.fileids())
    firsterms = " ".join(data.split()[:1000]).split()
    for term in firsterms:
        q(term.lower())
VB      verb, base form                     take
VBD     verb, past tense                    took
VBG     verb, gerund/present participle     taking
VBN     verb, past participle               taken
VBP     verb, sing. present, non-3d         take
VBZ     verb, 3rd person sing. present      takes
WDT     wh-determiner                       which
WP      wh-pronoun                          who, what
WP$     possessive wh-pronoun               whose
WRB     wh-adverb                           where, when
'''

#### inaugural: corpus data shipped with NLTK
train_text = inaugural.raw('1789-Washington.txt')
sample_text = inaugural.raw('2009-Obama.txt')

# sentence tokenization
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
import nltk
from nltk.corpus import inaugural, stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.stem import *
import pprint

# initialize pprint
pp = pprint.PrettyPrinter(indent=4)

# list of all speeches
ids = inaugural.fileids()
data = '1789-Washington.txt'

# get speech of particular file
speech = inaugural.raw(data)
speech = speech.lower()

# get sentences
sentences = inaugural.sents(data)

# sentence tokenize
sent_tokens = sent_tokenize(speech)
# print sentences
# pp.pprint(sent_tokens)

# word tokenize
word_tokens = word_tokenize(speech)
# print words
from string import punctuation
import nltk
from nltk.corpus import stopwords, inaugural
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

# Performing different nltk preprocessing steps on your data

# Data we will use
sample = inaugural.raw("2009-Obama.txt")

# Tokenizing: breaking down the body of text
print(sent_tokenize(sample))  # Sentence tokenizing: breaking down by sentence
print(word_tokenize(sample))  # Word tokenizing: breaking down the body of text by words

# STOP WORDS: removing grammar and prepositions that add no meaning to the data
stop_words = set(stopwords.words('english'))
print(stop_words)
stop_words = set(stopwords.words('english') + list(punctuation) + [u"'s", '""'])
print(stop_words)

# removing stop words from the corpora
allwords = []
for w in word_tokenize(sample):
    if w not in stop_words:
        allwords.append(w)
ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
'''
#class
ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
vocab = Counter()

# choose sample
sample1 = brown.raw()
sample2 = gutenberg.raw()
sample3 = inaugural.raw()
sample5 = nltk.corpus.state_union.raw()
sample4 = genesis.raw('english-web.txt')
sample = sample1 + sample2 + sample3 + sample4 + sample5

vocab, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev, ngrams_stats_bi_rev = mainTrain(
    vocab, sample, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev,
    ngrams_stats_bi_rev)
'''
with open('ngrams_stats_tri.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri, hfile)
with open('ngrams_stats_bi.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_bi, hfile)
with open('ngrams_stats_tri_rev.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri_rev, hfile)
with open('ngrams_stats_bi_rev.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_bi_rev, hfile)
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = KneserNeyInterpolated(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

# EXERCISE 3
train, vocab = padded_everygram_pipeline(2, state_union.sents('1945-Truman.txt'))
lm = MLE(2)
lm.fit(train, vocab)
print(lm.generate(100))

# Exercise 4
from neuralLG import dataset_preparation, create_model, generate_text

data = inaugural.raw()
X, Y, msl, total_words = dataset_preparation(data)
model = create_model(X, Y, msl, total_words)
text = generate_text("", 3, msl, model)
print(text)
# Organised Named Entity Recognition
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import inaugural
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

result = []
nltktag = nltk.ne_chunk(
    nltk.pos_tag(word_tokenize(inaugural.raw("1789-Washington.txt"))))
for subtree in nltktag:
    if type(subtree) == Tree:
        result.append(
            (" ".join([Y for Y, Z in subtree.leaves()]), subtree.label()))
print(result)
# The Brown corpus:
# Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))

print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
# print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS

# Each corpus reader provides a variety of methods to read data from the
# corpus, depending on the format of the corpus.
from nltk.corpus import inaugural

print(inaugural.raw('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras('1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
#
l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))
print(len(inaugural.words()))
print(inaugural.readme())
# Named Entity Recognition
# Chunking with NLTK with the help of regular expressions

# get all imports
import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer

# Create training and testing data
train_data = inaugural.raw("1789-Washington.txt")
sample_data = inaugural.raw("1793-Washington.txt")
train_tokenizer = PunktSentenceTokenizer(train_data)


def named_entity_recognition():
    try:
        print([
            nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(w)), binary=False).draw()
            for w in train_tokenizer.tokenize(sample_data)
        ])
    except Exception as e:
        print(str(e))


named_entity_recognition()
print("-------WARM UP---------") print("------TASK 1---------") #using inaugural fileids to list all the documents documents = inaugural.fileids() print( "Using the corpus reader class list all the documents in inaugural corpus :" ) print(documents) print("---------------------------------------------------------------------") print("Find the total number of words in Clinton’s 1993 speech :") #using .worrds method to count words in clinton speech clintonwords = (inaugural.words('1993-Clinton.txt')) print(len(clintonwords)) #.raw method will read the text in raw form s = inaugural.raw('1789-Washington.txt') w = set(m.group(0) for m in re.finditer(r"\w+", s)) #print (len(re.findall('\w+', s))) print("Find the total number of distinct words in the same speech :") #now we will find length of distinct words print(len(w)) # average function to calculate average word length def average(numbers): return sum(numbers) / len(numbers) lengths = [len(word) for word in clintonwords] print('Find the average word type length of same speech.:') print(average(lengths))
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk

nltk.download('inaugural')
nltk.download('gutenberg')
nltk.download('nps_chat')
nltk.download('webtext')
nltk.download('treebank')

from nltk.corpus import inaugural

text = inaugural.raw()
wordcloud = WordCloud(max_font_size=60).generate(text)
plt.figure(figsize=(16, 12))
# plot wordcloud in matplotlib
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

from nltk.book import text4 as inaugural_speeches

plt.figure(figsize=(16, 5))
topics = ['sports', 'news', 'Government']
inaugural_speeches.dispersion_plot(topics)

from nltk.corpus import brown

stop_words = set(STOPWORDS)
topics = ['Sports - الرياضة', 'News - الاخبار', 'Government - السياسة']
for topic in topics:
    words = [
        word for word in brown.words(categories=topic)
        if word.lower() not in stop_words and word.isalpha()
print(gramslist)
print('\n\n')
print(freq.items())
print('\n\n')
problist = [(i, freq[i] / len(gramslist)) for i in gramslist]
print('Probability-', problist)

file_content = open("input_text.txt").read()
tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(tokens)
get_ngram(tokens, 3)

obwords = word_tokenize(inaugural.raw('2009-Obama.txt'))
waswords = word_tokenize(inaugural.raw('1789-Washington.txt'))

print('\n\nOBAMA')
ob = FreqDist(obwords)
print('No. of words:', len(obwords))
print('No. of distinct words:', len(ob.keys()))
sortob = sorted(ob.items(), key=lambda x: x[1])
print('\n\nOBAMA50-', sortob[-50:])

was = FreqDist(waswords)
sortwas = sorted(was.items(), key=lambda x: x[1])
print('\n\nWASHINGTON50-', sortwas[-50:], '\n\n')

obuni = FreqDist(list(ngrams(obwords, 1)))
obbi = FreqDist(list(ngrams(obwords, 2)))
from nltk.corpus import inaugural, stopwords
from nltk import word_tokenize
from string import punctuation

stop_words = set(stopwords.words('english')) | set(punctuation)
obamatext = inaugural.raw("2009-Obama.txt")
obamalist = word_tokenize(obamatext)
obamalist = [word for word in obamalist if word not in stop_words]


def getNGrams(input_list, n):
    return [
        ' '.join(input_list[i:i + n])
        for i in range(len(input_list) - (n - 1))
    ]


def getallNGram(input_list):
    allngrams = dict()
    for i in range(1, 5):
        allngrams[str(i) + "_gram"] = getNGrams(input_list, i)
    return allngrams


def MostCommon(ngrams):
    for k in ngrams.keys():
        d = dict()
        for v in ngrams[k]:
            d[v] = ngrams[k].count(v)
        print(sorted(d, key=lambda x: d[x], reverse=True)[:5], '\n')
import nltk
import itertools
# importing the inaugural corpus
from nltk.corpus import inaugural
# importing stopwords
from nltk.corpus import stopwords
# importing Frequency distribution
from nltk.probability import FreqDist

words = inaugural.words('1993-Clinton.txt')
speech = inaugural.raw('1993-Clinton.txt')
stop_words = set(stopwords.words('english'))

# listing all the documents in the inaugural corpus
# print(inaugural.fileids())

# the number of words in Clinton's 1993 speech
print('Total number of words in the given text is:',
      len(inaugural.words('1993-Clinton.txt')))

# unique words in the speech
unique_words = sorted(set(inaugural.words('1993-Clinton.txt')))
print('Number of unique words in the given text is:', len(unique_words))


# function that returns the avg length of words in a word list.
def avg_word_length(word_list):
def re(corpus):
    myRE1 = RAKE_tagged(100, stopwords='auto', pos=["N", "VBP", "R"])  # 100
    myRE = RAKE_tagged(80, stopwords='auto', pos=["N"])  # 30
    summary = myRE1.transform(corpus, output_type="s")
    summaries = ["; ".join(s) for s in summary]
    keywords = myRE.transform(summaries, output_type="w")
    CF = Concept_finder()
    CF.fit(myRE1.finaltext)
    arr = CF.transform(keywords)
    CL = Clustering(arr, CF.model)
    CL.fit_transform(20)  # 50
    CL.visualize()


# load the data
# df = pd.read_csv('../input/stage2_test_text.csv', sep='\|\|', header=None,
#                  skiprows=1, names=["ID", "Text"])
from nltk.corpus import inaugural

sample = [{
    'ID': fileid,
    'Text': inaugural.raw(fileid)
} for fileid in inaugural.fileids()]
df = pd.DataFrame(sample)
# df.head()
df_txt = df['Text']
re(df_txt)
import re
import collections
from nltk.corpus import inaugural, reuters, brown, gutenberg
from itertools import product as iter_product


def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Listing 4-2: Splitting a whole document into words and counting occurrence frequencies
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import inaugural

sents = nltk.tokenize.sent_tokenize(inaugural.raw('1789-Washington.txt'))
cnt = Counter(len(sent.split()) for sent in sents)
print(sorted(cnt.items(), key=lambda x: [x[1], x[0]], reverse=True))

nstring = np.array([len(sent.split()) for sent in sents])
plt.hist(nstring)
plt.title('Words per sentence in the 1789 Washington inaugural address')
plt.xlabel('Words per sentence')
plt.ylabel('Frequency')
plt.show()
import nltk
from nltk.corpus import inaugural
from nltk.tokenize import PunktSentenceTokenizer

train_text = inaugural.raw("1801-Jefferson.txt")
sample_text = inaugural.raw("1801-Jefferson.txt")

sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEntity = nltk.ne_chunk(tagged)
            namedEntity.draw()
    except Exception as e:
        print(str(e))