def run():
    # Dataset generation
    cls = ['Obama', 'Trump']
    obama_sentences = inaugural.sents('2009-Obama.txt')
    trump_sentences = inaugural.sents('2017-Trump.txt')
    labelled_obama = [(s, cls[0]) for s in obama_sentences]
    labelled_trump = [(s, cls[1]) for s in trump_sentences]
    labelled_data = labelled_obama + labelled_trump
    trump_test = [
        'We', ',', 'the', 'citizens', 'of', 'America', ',', 'are', 'now',
        'joined', 'in', 'a', 'great', 'national', 'effort', 'to', 'rebuild',
        'our', 'country', 'and', 'restore', 'its', 'promise', 'for', 'all',
        'of', 'our', 'people', '.'
    ]
    obama_test = [
        'I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task',
        'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you', 'have',
        'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices', 'borne', 'by',
        'our', 'ancestors', '.'
    ]
    model = Model(labelled_data, cls)
    model.train()
    while True:
        inp = input("Input a string to test: ")
        doc = tokenize(inp)
        print(model.test_doc(doc))
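This snippet assumes a Model class and a tokenize helper defined elsewhere in the project. A minimal stand-in for tokenize (hypothetical, for illustration only) could use NLTK's word_tokenize so the user input is split in the same word/punctuation style as inaugural.sents():

from nltk.tokenize import word_tokenize

def tokenize(text):
    # Split free text into word/punctuation tokens,
    # matching the token style of inaugural.sents().
    return word_tokenize(text)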
def sent_length():
    text_file = str(input("Enter the name of a text file : \n"))
    txt_fl = inaugural.sents(text_file)
    print(len(txt_fl))
    file_name = inaugural.fileids()
    print(len(inaugural.sents(file_name)))
def avgSent():
    x2 = []
    y2 = []
    for fileid in inaugural.fileids():
        average = sum(len(sent) for sent in inaugural.sents(fileids=[fileid])) / len(
            inaugural.sents(fileids=[fileid]))
        print(fileid[:4], "-", average)
        y2.append(fileid[:4])
        x2.append(average)
    plt.title('Average sentence length:')
    plt.xticks(rotation=90)
    plt.plot(y2, x2)
    plt.show()
def get_sentences():
    """Collect the sentences from the inaugural corpus and return them as a list.

    Call this function to obtain the sentence data.
    """
    articles = inaugural.fileids()
    sentences = []
    for i in articles:
        article = inaugural.sents(i)
        sentences = sentences + list(article)
    return sentences
def get_inaugural_docs(download=False) -> List[List[List[str]]]:
    """
    Get the inaugural documents as a list (documents) of list (sentences)
    of list (sentence) of strings (words).

    :param download: If True, the corpus will be downloaded. Default=False
    :return: The lower-cased, tokenized inaugural addresses
    """
    if download:
        nltk.download('inaugural')
    return [[[w.lower() for w in sent] for sent in inaugural.sents(fileid)]
            for fileid in inaugural.fileids()]
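A quick usage check of the nested structure (assuming the inaugural corpus is already available):

docs = get_inaugural_docs()    # pass download=True on first run
print(len(docs))               # number of addresses
print(docs[0][0][:8])          # first eight lower-cased tokens of the first sentence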
def main():
    # @BEGIN normalize_list
    # @IN inaugural @URI file:data/inaugural/{year}-{president}.txt
    # @OUT normalized_addresses
    file_ids = inaugural.fileids()
    print(file_ids)
    normalized_addresses = []
    for address in file_ids:
        normalized_words = [address.split("-")[0]]
        for sent in inaugural.sents(address):
            prev_word = ""
            for word in sent:
                if prev_word == "'":
                    continue
                normalized = re.sub("[^a-z0-9]", "", word.lower())
                if normalized != "":
                    normalized_words.append(normalized)
                prev_word = word
        normalized_addresses.append(normalized_words)
    # @END normalize_list

    # @BEGIN pickleize
    # @IN normalized_addresses
    # @OUT pkl @URI file:data/norm_addresses.pkl
    fout = open("norm_addresses.pkl", "wb")
    pickle.dump(normalized_addresses, fout)
    fout.close()
    # @END pickleize

    # deserialize pkl file
    # @BEGIN depickleize
    # @IN pkl @URI file:data/norm_addresses.pkl
    # @OUT address_word_list
    fin = open("norm_addresses.pkl", "rb")
    address_word_list = pickle.load(fin)
    fin.close()
    # @END depickleize

    # @BEGIN frequency
    # @IN address_word_list
    # @IN search_word
    # @OUT frequency_maps
    search_word = input("Input word to find frequency: ")
    frequency_maps = {}
    for word_list in address_word_list:
        frequency_maps[word_list[0]] = calculate_frequency_map(word_list[1:])
    # @END frequency

    generate_plot(search_word, frequency_maps)
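The snippet above assumes calculate_frequency_map and generate_plot helpers defined elsewhere. A minimal sketch of what the frequency helper might look like (hypothetical name and behavior, mapping words to relative frequencies):

from collections import Counter

def calculate_frequency_map(words):
    # Hypothetical helper: relative frequency of each word in one address
    counts = Counter(words)
    total = len(words)
    return {word: count / total for word, count in counts.items()}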
def read_address(address):
    '''
    Reads the given nltk inaugural address to a string
    '''
    full_address = ""
    # join all the words in each sentence
    for sent in inaugural.sents(address):
        sent = ' '.join(sent)
        full_address = full_address + sent + '\n'
    return full_address
def write_files():
    import nltk
    import re
    import pickle
    from nltk.corpus import inaugural

    files = nltk.corpus.inaugural.fileids()
    masterList = list()
    for i in files:
        sentences = inaugural.sents(i)
        sentLst = [' '.join(sent) + '\n' for sent in sentences]
        theString = str(sentLst)
        fixedString = tokenize(theString)
        masterList.append(fixedString)
    fout = open('proj3.pkl', 'wb')
    pickle.dump(masterList, fout)
    fout.close()
def get_default_sentences() -> list:
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return (brown_sentences + gutenberg_sentences + reuters_sentences +
            webtext_sentences + inaugural_sentences)
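This function relies on a detok_sentences helper that is not shown here. A minimal sketch, consistent with the joining logic used in the last snippet of this collection:

def detok_sentences(tokenized_sentences):
    # Join each token list back into a plain-string sentence.
    sentences = []
    for tok_sent in tokenized_sentences:
        sentences.append(' '.join(tok_sent).strip())
    return sentences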
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  # purple color
        plot(index, b, 'go')  # green color
        plot(index, c, 'ro')  # red color
    xlabel(
        'index, from Washington to Obama (purple - character/word), (red - word/vocab)'
    )
    ylabel('Average numbers (green - word/sentence)')
    show()
def main():
    synset_group = []
    word_group = []
    president_Synset_usage = []
    f = open("keyword_group.txt", 'r')
    while True:
        synset = f.readline()
        if not synset:
            break
        synset = synset.strip()
        if synset == 'group':
            synset_group.append(set())
        else:
            synset_group[-1].add(synset)
    f.close()

    group_size = len(synset_group)
    for g in synset_group:
        word_group.append(syn_to_lem(g))

    for fileid in inaugural.fileids():
        corpus = clean_corpus(inaugural.sents(fileid))
        total_len = sum(map(len, corpus))
        Group_usage = np.zeros((1, group_size))
        for sent in corpus:
            for i in range(group_size):
                keywords = word_group[i]
                check = set(filter(lambda x: x in sent, keywords))
                Group_usage[0][i] += len(check)
        Group_usage /= total_len
        president_Synset_usage.append((fileid[:-4], Group_usage))
    #for f in president_Synset_usage: print(f)

    learnable = []
    for f in president_Synset_usage[-15:]:
        learnable.append(f[1])
    data = concat_all(learnable)
    label = np.array([[
        4.65, 5.05, 2.86, 2.58, 3.24, 3.14, 3.82, 2.25, 3.31, 4.45, 2.35,
        2.03, 1.46, 2.19, 2.48
    ]])
    label = label.T
    #best_lambda(3,data,label)

    train_acc = []
    l_list = []
    idx = np.arange(group_size)
    np.random.seed(6)
    np.random.shuffle(idx)
    for i in range(group_size):
        dt = data.T
        #dt = dt[idx[:i+1]]
        dt = dt[:i + 1]
        #train, test = k_fold(3,dt.T,label)
        train, test, l = best_lambda(3, dt.T, label)
        train_acc.append(train)
        l_list.append(l)

    print('\nl: ', l_list)
    print('\ntrain: ', train_acc)
    x_range = np.arange(group_size)
    plt.plot(x_range, train_acc, c='k')
    plt.xlabel('number of seed words')
    plt.ylabel('R2 loss')
    plt.show()
# === Part 1: Importing Corpora ===
import nltk
from nltk.corpus import inaugural

print(inaugural.fileids())
# Run your file. You should see all the text files containing all the speeches of the US presidents that the
# NLTK has saved inside it.
# Now add the lines:
print("=============Words in Obama's Speech ======")
print(inaugural.words('2009-Obama.txt'))  # Returns a list of all the words in Obama's speech
print("=============Sentences in Bush's speech ======")
print(inaugural.sents('2005-Bush.txt'))  # Returns a list of all the sentences in Bush's speech
# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print the first 25 words of Obama's 2009 speech.

# === Part 2: Analysing tokens (words) of a text ===
# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program
from nltk.book import *
# This may take a while to load. NLTK has many texts stored in it!
# Once it's loaded, type:
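For the "first 25 words" exercise in Part 1 above, one possible answer is a simple slice of the word list:

# First 25 words of Obama's 2009 inaugural speech
print(inaugural.words('2009-Obama.txt')[:25])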
""" Train a Word2Vec model, and print the most similar words to "war" """ import warnings warnings.filterwarnings("ignore") from typing import List, Dict, Tuple import hashlib import nltk from nltk.corpus import inaugural from gensim.models import Word2Vec # Pass this hashfxn to gensim's Word2Vec. def hashfxn(s) -> int: return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % 10 ** 8 sentences = inaugural.sents() model = Word2Vec(sentences=sentences, size=100, workers=1, hashfxn=hashfxn, iter=10) print("5 most similar words to war:") print(model.most_similar("war", topn=5)) # Print some more
#!/usr/bin/env python
# coding: utf-8

# In[4]:

from nltk.corpus import inaugural

# In[5]:

inaugural.words('1789-Washington.txt')

# In[6]:

inaugural.raw('1789-Washington.txt')

# In[7]:

inaugural.sents('1789-Washington.txt')

# In[8]:

inaugural.paras('1789-Washington.txt')

# In[1]:

from nltk.corpus import wordnet as wn

wn.synsets('dog')

# In[ ]:
# print(list(ngrams(random_text, 5)))
# print(list(ngrams(random_words, 5)))

# Google n-gram viewer

# random_sentence = inaugural.sents('2009-Obama.txt')[1]
# for trg in ngrams(random_sentence, 3):
#     print(trg)
# print(list(bigrams(random_sentence, pad_right=True)))
# for trg in (ngrams(random_sentence, 4,
#                    pad_right=True, right_pad_symbol='</s>',
#                    pad_left=True, left_pad_symbol='<s>')):
#     print(trg)

target_speeches = ['1789-Washington.txt', '1861-Lincoln.txt', '2001-Bush.txt']
ngr = {}

# tokens
for text in target_speeches:
    data = inaugural.words(text)
    for trg in ngrams(data, 2):
        if trg not in ngr:
            ngr[trg] = 1
        else:
            ngr[trg] += 1
ngr = sorted(ngr.items(), key=lambda kv: kv[1], reverse=True)

# sentences
bigr_of_text = []
for text in target_speeches:
    data = inaugural.sents(text)
    bigr_of_text += ngrams(data, 2)
print(bigr_of_text)
# a = FreqDist(bigr_of_text)
# print(a.most_common(5))
import pprint

# initialize pprint
pp = pprint.PrettyPrinter(indent=4)

# list of all speeches
ids = inaugural.fileids()

data = '1789-Washington.txt'

# get speech of particular file
speech = inaugural.raw(data)
speech = speech.lower()

# get sentences
sentences = inaugural.sents(data)

# sentence tokenize
sent_tokens = sent_tokenize(speech)
# print sentences
#pp.pprint(sent_tokens)

# word tokenize
word_tokens = word_tokenize(speech)
# print words
#pp.pprint(word_tokens)

stop_words = set(stopwords.words('english'))
#pp.pprint(stop_words)
book_sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        book_sentences.append(sentence_to_wordlist(raw_sentence))

#print(raw_sentences[5])
#print(book_sentences[5])

conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")

conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")

inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")

abc_corp_sents = abc.sents()
print("ABC to sentences")

genesis_corp_sents = genesis.sents()
print("Genesis to sents")

frame_net_corp_sents = fn.sents()
print("Frame_net to sents")

state_union_corp_sents = state_union.sents()
print('state union to sents')

subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')

brown_corp_sents = brown.sents()
print("Brown corpus to sents")

movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents")
from nltk.corpus import inaugural
from nltk.util import ngrams
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated

# Exercise 1
president_unigrams = {}
for president in inaugural.fileids():
    text_unigrams = [ngrams(sent, 1) for sent in inaugural.sents(president)]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

inverse_unigrams = [(value, key) for key, value in president_unigrams.items()]
print(max(inverse_unigrams)[1], max(inverse_unigrams)[0])  # longest discourse: Harrison in 1841
print(min(inverse_unigrams)[1], min(inverse_unigrams)[0])  # shortest discourse: Washington in 1793

president_vocabulary = {}
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)
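A small follow-up that reuses the dictionary just built to report the address with the largest vocabulary (illustrative only, not part of the original exercise):

# Fileid with the largest type count after the unk_cutoff filter
richest = max(president_vocabulary, key=president_vocabulary.get)
print(richest, president_vocabulary[richest])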
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import inaugural
from nltk import FreqDist

speech_wash = list(inaugural.sents('1789-Washington.txt'))
speech_adams = list(inaugural.sents('1797-Adams.txt'))
speech_lincoln = list(inaugural.sents('1861-Lincoln.txt'))

wash = []
for i in speech_wash:
    wash_b = list(
        bigrams(i,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    wash.extend(wash_b)

adams = []
for a in speech_adams:
    adams_b = list(
        bigrams(a,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    adams.extend(adams_b)

lincoln = []
for l in speech_lincoln:
    lincoln_b = list(
        bigrams(l,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    lincoln.extend(lincoln_b)
print("\nPart 3:") fd = FreqDist(text1) print("Amount of times 'the' appears in text 1: \n" + str(fd['the'])) print(fd.keys()) print(fd.items()) print("End Part 3\n") # === Part 4: Your task ===# print("\nPart 4:") # 1.) Returns the 10 most frequent words in Obama's 2009 inaugural speech, including their frequencies fd = FreqDist(inaugural.words('2009-Obama.txt')) print(fd.most_common(10)) # 2.) Calculates the Lexical Richness of his speech print(len('2009-Obama.txt') / len(set('2009-Obama.txt'))) # 3.) Calculate the average length of the sentences in his speech print(len(inaugural.sents('2009-Obama.txt'))) print("\nOptional Part\n") # 4.) Write a separate function called sent_length that takes in a string for the name of a text like '2009-Obama.txt', then finds the average # length of the sentences in that speech. Compute the average length of sentences from the year of the first speech (1789) to the year of # Obama's 2009 speech and see how the average length of sentences has changed over the time of the entire US's history. Remember that # inaugural.fileids() will give you a list of all the String names of the speeches so you don't need to work out each name of each speech. def sent_length(): text_file = str(input("Enter the name of a text file : \n")) txt_fl = inaugural.sents(text_file) print(len(txt_fl)) file_name = inaugural.fileids() print(len(inaugural.sents(file_name)))
# Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))

print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS

# Each corpus reader provides a variety of methods to read data from the corpus,
# depending on the format of the corpus.
from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras('1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))
print(len(inaugural.words()))
print(inaugural.readme())
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# import nltk
# nltk.download('inaugural')

import os
from nltk.corpus import inaugural

corpus_from_paragraphs = inaugural.paras(os.path.dirname(__file__) + '/dataset/paragraphs.txt')
corpus_from_sentences = inaugural.sents(os.path.dirname(__file__) + '/dataset/sentences.txt')
corpus_from_words = inaugural.words(os.path.dirname(__file__) + '/dataset/words.txt')

l1 = len(corpus_from_paragraphs)
l2 = len(corpus_from_sentences)
l3 = len(corpus_from_words)
# l2 = 0
# l3 = 0

print('paragraphs: %s, sentences: %s, words: %s' % (l1, l2, l3))
# print(inaugural.readme())
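Note that the inaugural reader normally resolves fileids against its own corpus root, so pointing it at an arbitrary dataset directory may not behave as intended. A sketch that reads the same assumed dataset layout with a dedicated plaintext reader instead:

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

dataset_root = os.path.join(os.path.dirname(__file__), 'dataset')
reader = PlaintextCorpusReader(dataset_root, r'.*\.txt')
print('paragraphs: %s, sentences: %s, words: %s' % (
    len(reader.paras('paragraphs.txt')),
    len(reader.sents('sentences.txt')),
    len(reader.words('words.txt')),
))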
        compared_word = list(vec_dict.keys())[arg]
        if compared_word != word:
            print(compared_word, ' ' * (14 - len(compared_word)), pairwise_sims[0][arg])
    print()


def random_similarwords(examples=10):
    for _ in range(examples):
        word = choice(list(vec_dict.keys()))
        mostsimilar(word)


if __name__ == "__main__":
    # Get vocabulary & ngrams
    window_size = 5
    stopwords = stopwords.words('english')
    lmtzer = WordNetLemmatizer()
    filtered_sents = [tuple(lmtzer.lemmatize(word.lower())
                            for word in sent if word.isalnum() and word not in stopwords)
                      for sent in inaugural.sents()]
    n_grams = get_ngrams(filtered_sents)
    vocab = set(word for sent in filtered_sents for word in sent)
    vec_dict = get_word_vecs()

    mostsimilar('demoralizes')
    print(vec_dict['demoralizes'])
    for sent in filtered_sents:
        if 'demoralizes' in sent:
            print(sent)

    # random_similarwords(examples=10)
word_count_total = len(inaugural.words(speech))
print(speech, word_count_total)

# Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
    sents_total = len(inaugural.sents(speech))
    print((word_total / sents_total), speech)

# Creating a DataFrame of the speeches
data = pd.DataFrame([int(speech[:4]),
                     len(inaugural.words(speech)) / len(inaugural.sents(speech))]
                    for speech in inaugural.fileids())
print(data.head())

# Adding column names
data.columns = ["Year", "Avg WPS"]
print(data)

# Use Matplotlib
data.plot("Year", figsize=(15, 5))
total_word = len(inaugural.words(speech))
print(str(total_word) + " Title: " + speech)

# To collect that output as a list, a list comprehension works well
speech_len = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_len)
print(max(speech_len))
print(min(speech_len))

# Find out the average no of words per sentence
for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    sents_total = len(inaugural.sents(speech))
    avg_word_per_sents = words_total / sents_total
    print(avg_word_per_sents, speech)

# The best way to show this information is to plot it.
# Build the data frame with pandas
data = pd.DataFrame([
    int(speech[:4]),
    len(inaugural.words(speech)) / len(inaugural.sents(speech))
] for speech in inaugural.fileids())
data.columns = ["Year", "Average WPS"]
print(data.head(10))

plt.interactive(False)
data.plot("Year", figsize=(15, 5))
        sentences.append(' '.join(tok_sent).strip())
    return sentences


print("Loading sentences.")
nltk.download('brown')
brown_tokenized_sentences = brown.sents()
brown_sentences = detok_sentences(brown_tokenized_sentences)
nltk.download('gutenberg')
nltk.download('punkt')
gutenberg_tokenized_sentences = gutenberg.sents()
gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
nltk.download('reuters')
reuters_tokenized_sentences = reuters.sents()
reuters_sentences = detok_sentences(reuters_tokenized_sentences)
nltk.download('webtext')
webtext_tokenized_sentences = webtext.sents()
webtext_sentences = detok_sentences(webtext_tokenized_sentences)
nltk.download('inaugural')
inaugural_tokenized_sentences = inaugural.sents()
inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)

all_sentences = (brown_sentences + gutenberg_sentences + reuters_sentences +
                 webtext_sentences + inaugural_sentences)

outfile = codecs.open('output.txt', 'w')
for sentence in all_sentences:
    cleaned_sentence = sentence.replace(" ' s ", "'s ")
    cleaned_sentence = cleaned_sentence.replace("n ' t ", "n't ")
    cleaned_sentence = cleaned_sentence.replace(" ,", ",")
    cleaned_sentence = cleaned_sentence.replace(" .", ".")
    outfile.write('{}\n'.format(cleaned_sentence))