def exercise_gutenberg():
    # Print the list of files in the Gutenberg corpus
    print gutenberg.fileids()
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Check the length of the book
    print len(emma)
    # Wrap the word list in an nltk.Text for concordancing
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct (lower-cased) words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # Print average word length, average sentence length,
        # average occurrences per vocabulary item, and the file name
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
def get_book_sents(word_list): """Searches Jane Austen's 'Emma' for the words in the word list. The sentences are modified to highlight the found words by changing them to uppercase. Then the sentence number (in order from the book) is appended to the front of the sentence string. Returns a list of strings (sentence # + \s + sentence string). """ book = 'austen-emma.txt' book_sents = gutenberg.sents(book) sent_nums = set() sents_to_return = [] s_count = 0 for s in book_sents: s_count += 1 s_str = " ".join(s) for w in word_list: if ' '+w+' ' in s_str.lower(): if s_count not in sent_nums: sent_nums.add(s_count) s_str = s_str.replace(' '+w+' ', ' '+w.upper()+' ') s_str = s_str.replace(' '+w.title()+' ', ' '+w.upper()+' ') sents_to_return.append(str(s_count)+' '+s_str) else: s_str = s_str.replace(' '+w+' ', ' '+w.upper()+' ') s_str = s_str.replace(' '+w.title()+' ', ' '+w.upper()+' ') sents_to_return[-1] = str(s_count)+' '+s_str return sents_to_return
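# Hedged usage sketch for get_book_sents() above; the search words are arbitrary
# examples, and `gutenberg` is assumed to already be imported from nltk.corpus
# as the function requires.
matches = get_book_sents(['surprize', 'vexed'])
for line in matches[:5]:
    print(line)  # "<sentence number> <sentence with matched words in UPPERCASE>"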
def main(num_couplets, num_syllables, rhyme_depth): for text in TEXTS: for sentence in gutenberg.sents(text): addSentence(sentence, rhyme_depth) for couplet_number in range(0, num_couplets): # Get a randomly selected couplet attempts = 0 while True: couplet = getCouplet(num_syllables) if couplet is not None: break # Prevent an infinite loop if parameters are off attempts += 1 if attempts == 1000: return couplet = [ pretty(line) for line in couplet ] # A little hack for adjusting punctuation and capitalization couplet[0] = couplet[0][0].upper() + couplet[0][1:] if couplet[0][-1] == '.' or couplet[0][-1] == ',': couplet[0] = couplet[0][:-1] + ',' char = couplet[1][0].lower() if couplet[1][:2] != 'I ' else 'I' couplet[1] = char + couplet[1][1:] else: couplet[1] = couplet[1][0].upper() + couplet[1][1:] # Dump to stdout print couplet[0] print couplet[1]
def gutenberg(): from nltk.corpus import gutenberg for t in gutenberg.fileids(): num_chars = len(gutenberg.raw(t)) num_words = len(gutenberg.words(t)) num_sents = len(gutenberg.sents(t)) num_vocab = len(set([w.lower() for w in gutenberg.words(t)])) print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
def tagged_sentences(book):
    '''
    Generator yielding one POS-tagged sentence at a time, dropping tokens
    whose tag is ':' or '-NONE-', which are anomalies in the tagged output.
    '''
    for sentence in gutenberg.sents(book):
        yield [tok for tok in nltk.pos_tag(sentence) if tok[1] not in (':', '-NONE-')]
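# Hedged usage sketch for tagged_sentences(); assumes `import nltk` and
# `from nltk.corpus import gutenberg`, as the generator above expects.
for tagged in tagged_sentences('austen-emma.txt'):
    print(tagged)  # a list of (word, POS-tag) tuples for the first sentence
    break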
def plot_sentiment_flow(title):
    sents = gutenberg.sents(title)
    positive_flow = [partial_sentiment(x) for x in sents]
    negative_flow = [partial_sentiment(x, positive=False) for x in sents]
    plt.plot(range(len(sents)), positive_flow, label='Positive')
    plt.plot(range(len(sents)), negative_flow, label='Negative')
    plt.legend()  # show the 'Positive'/'Negative' labels
    plt.ylabel('Sentiment Score')
    plt.xlabel(title)
    plt.show()
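# Hedged usage sketch for plot_sentiment_flow(); assumes partial_sentiment() is
# defined elsewhere and matplotlib.pyplot is imported as plt, as the function expects.
plot_sentiment_flow('austen-emma.txt')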
def gutenberg(): emma = nltk.corpus.gutenberg.words('austen-emma.txt') print len(emma) print gutenberg.fileids() emma = gutenberg.words('austen-emma.txt') macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') macbeth_sentences[1037] longest_len = max([len(s) for s in macbeth_sentences]) [s for s in macbeth_sentences if len(s) == longest_len] for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
def structure(): raw = gutenberg.raw("burgess-busterbrown.txt") raw[1:20] words = gutenberg.words("burgess-busterbrown.txt") words[1:20] sents = gutenberg.sents("burgess-busterbrown.txt") sents[1:20]
def page59(): """Prints the longest sentence from Macbeth""" from nltk.corpus import gutenberg macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt") print "macbeth_sentences=", macbeth_sentences print "macbeth_sentences[1037]=", macbeth_sentences[1037] longest_len = max([len(s) for s in macbeth_sentences]) print "longest sentence=", print [s for s in macbeth_sentences if len(s) == longest_len]
def create_model_from_NLTK(): filepath = "nltkcorpus.txt" if isfile(filepath): return create_model(filepath= filepath, save=False) else: from nltk.corpus import reuters, brown, gutenberg sents = reuters.sents() + brown.sents() for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]: sents += gsents return create_model(sentences=sents, savename=filepath)
def page57(): """Statistics from the Gutenberg corpora""" from nltk.corpus import gutenberg for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print int(num_chars / num_words), int(num_words / num_sents), print int(num_words / num_vocab), fileid
def for_print():
    '''
    Display three statistics for each text: average word length,
    average sentence length, and average occurrences per vocabulary item.
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length, average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appears in the text
        print int(num_words/num_vocab), fileid
def train(self): self.vocabulary=set() this_bigrams=[] self.unigrams = FreqDist([]) for fileid in gutenberg.fileids(): for sentence in gutenberg.sents(fileid): words=["<s>",] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>",] this_bigrams += bigrams(words) self.vocabulary.update(words) self.unigrams.update(words) self.bigrams=ConditionalFreqDist(this_bigrams) self.V = len(self.vocabulary)
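# Hedged sketch of reading a smoothed probability off the counts built by train();
# `model` is a hypothetical trained instance of the surrounding class, and the
# add-one (Laplace) smoothing shown here is only an illustration, not the class's
# own estimator.
def bigram_prob(model, w1, w2):
    # P(w2 | w1) with add-one smoothing over the stored bigram/unigram counts.
    return (model.bigrams[w1][w2] + 1.0) / (model.unigrams[w1] + model.V)

# e.g. bigram_prob(model, "<s>", "the")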
def benchmark_sbd(): ps = [] rs = [] f1s = [] c = 0 for fileid in gutenberg.fileids(): c += 1 copy_sents_gold = gutenberg.sents(fileid) sents_gold = [s for s in copy_sents_gold] for sent_i in range(len(sents_gold)): new_sent = [w for w in sents_gold[sent_i] if w.isalpha()] sents_gold[sent_i] = new_sent text = gutenberg.raw(fileid) sents_obtained = split_text(text) copy_sents_obtained = sents_obtained.copy() for sent_i in range(len(sents_obtained)): new_sent = [w.group() for w in re.finditer(r'\w+', sents_obtained[sent_i]) if w.group().isalpha()] sents_obtained[sent_i] = new_sent c_common = 0 for sent in sents_obtained: if sent in sents_gold: c_common += 1 p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold)) print('\n\n', fileid) print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1)) ps.append(p) rs.append(r) f1s.append(f1) print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps), np.std(ps))) print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs), np.std(rs))) print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s), np.std(f1s))) print(len(f1s)) good_ps = [p for p in ps if p >= 0.8] good_rs = [r for r in rs if r >= 0.8] good_f1s = [f1 for f1 in f1s if f1 >= 0.8] print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps), np.std(good_ps))) print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs), np.std(good_rs))) print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s), np.std(good_f1s))) print(len(good_f1s))
def tokenize_data(self, n=-1):
    # Download dependent nltk resources if you haven't already.
    # nltk.download('punkt')
    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print "Reading sentences from gutenberg corpus ..."
    from nltk.corpus import gutenberg
    tokenized_sentences = []
    for s in gutenberg.sents('austen-emma.txt'):
        tokenized_sentences.append([self.sentence_start_token] + s[1:-1] + [self.sentence_end_token])
    print "Parsed %d sentences." % (len(tokenized_sentences))
    if n > 0:
        tokenized_sentences = tokenized_sentences[:n]

    # count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique words tokens." % len(word_freq.items())
    self.vocabulary_size = int(len(word_freq.items()) * 0.95)

    # get the most common words, treat other words as unknown.
    vocab = word_freq.most_common(self.vocabulary_size - 1)
    print "Using vocabulary size %d." % self.vocabulary_size
    print "The least frequent word is '%s' and appeared %d times." % \
        (vocab[-1][0], vocab[-1][1])
    self.index_to_word = [x[0] for x in vocab]
    self.index_to_word.append(self.unknown_token)
    self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])

    # replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [w if w in self.word_to_index else self.unknown_token for w in sent]

    # create training data
    x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

    print ""
    print "Example sentence: '%s'" % tokenized_sentences[0]
    print "By word indexes: '%s'" % \
        [self.word_to_index[w] for w in tokenized_sentences[0]]
    return (x_train, y_train)
def get_gutenberg_data(self): count = {} self.len_list = [] #my_fileids = ['austen-sense.txt', 'austen-emma.txt', 'austen-persuasion.txt'] #my_fileids = ['chesterton-ball.txt', 'chesterton-ball.txt', 'chesterton-thursday.txt'] my_fileids =['shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt'] for fileids in my_fileids: for sent in gutenberg.sents(fileids): l = len(sent) #if l < 3: #continue self.len_list.append(l) if l in count: count[l] += 1 else: count[l] = 1 total = len(self.len_list) for i in range(100): if i in count.keys(): self.probs.append(count[i]/(total+0.0)) else: self.probs.append(0)
def test_train_selection( choice): # For Dataset combination selection S1,S2,S3,S4 total_sent1 = list(brown.sents()) total_sent2 = list(gutenberg.sents()) # Start and end of sentence tagging for sent in total_sent1: sent.insert(0, "<s>") sent.insert(len(sent), "</s>") for sent2 in total_sent2: sent2.insert(0, "<s>") sent2.insert(len(sent2), "</s>") train1, test1 = train_test_split(total_sent1, test_size=0.1, random_state=4) train2, test2 = train_test_split(total_sent2, test_size=0.1, random_state=4) #optimal discount values are being passed which had been calculated through held out data if choice == 1: # D1-train , D1- test LM_model_kneser(train1, test1, 5500, 0.8) LM_model_katz(train1, test1, 5500, 0.75) LM_model_trigram(train1, test1, 5500) elif choice == 2: # D2-train , D2- test LM_model_kneser(train2, test2, 5500, 0.8) LM_model_katz(train2, test2, 5500, 0.75) LM_model_trigram(train2, test2, 5500) elif choice == 3: # D1 + D2 train , D1- test LM_model_kneser(train1 + total_sent2, test1, 5500, 0.75) LM_model_katz(train1 + total_sent2, test1, 5500, 0.6) LM_model_trigram(train1 + total_sent2, test1, 5500) else: # D1 + D2 train , D2- test LM_model_kneser(total_sent1 + train2, test2, 5500, 0.8) LM_model_katz(total_sent1 + train2, test2, 5500, 0.6) LM_model_trigram(total_sent1 + train2, test2, 5500)
def text_cleaning():
    brown_data = brown.sents()
    gutenberg_data = gutenberg.sents()
    punctuations = [',', '.', ':', ';', '?', '"', '!', '--', '(', ')', '``']
    punctuations.append("''")
    for sentence in brown_data:
        # Build a cleaned copy instead of removing items while iterating,
        # which would skip tokens that follow a removed one.
        cleaned = ['<s>', '<s>'] + [w for w in sentence if w not in punctuations] + ['</s>']
        brown_text.append(cleaned)
    for sentence in gutenberg_data:
        cleaned = ['<s>', '<s>'] + [w for w in sentence if w not in punctuations] + ['</s>']
        gutenberg_text.append(cleaned)
def q1(): # 1. Print the number of word tokens # YOUR CODE from nltk.corpus import gutenberg as gb #if you want to print all file ids in gutenberg archive #print(gb.fileids()) file_id = 'austen-sense.txt' word_list = gb.words(file_id) print(len(word_list)) # 2. Print the number of word types # YOUR CODE print(len( set( [ w.lower() for w in word_list ]) )) # 3. Print all tokens in the first sentence # YOUR CODE sent_list = gb.sents(file_id) print(' '.join(sent_list[0])) # if you want to tokenize a string raw = 'i have a book.' from nltk import word_tokenize as wt word_list = wt(raw)
def getFreq(n):
    freq = {}
    length2 = {}
    for category in brown.categories():
        sentences = brown.sents(categories=category)
        length2[category] = len(sentences)
        for sentence in sentences[:int(length2[category] * 0.9)]:
            text = " <s> " + ' '.join(re.compile(r'\w+').findall(' '.join(sentence))).lower() + " </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" " + line + " ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count
    length = {}
    for category in gutenberg.fileids():
        sentences = gutenberg.sents(category)
        length[category] = len(sentences)
        for sentence in sentences[:int(length[category] * 0.9)]:
            text = " <s> " + ' '.join(re.compile(r'\w+').findall(' '.join(sentence))).lower() + " </s> "
            model = myNGrams(text, n)
            for x in model:
                line = ' '.join(x)
                count = len(re.findall(" " + line + " ", text))
                if (x not in freq) and (count != 0):
                    freq[x] = 0
                if count != 0:
                    freq[x] += count
    return [freq, length]
def write_sentence(): """Step 2: Choose a work, identify the author, and choose a sentence.""" work = gutenberg.fileids()[np.random.randint(len(gutenberg.fileids()))] author = re.findall('(\w+)-', work)[0].title() sentences = gutenberg.sents(work) rndm_sentence = sentences[np.random.randint(len(sentences))] tagged_rndm_sentence = pos_tag(rndm_sentence) """Step 3: Replace every word in the sentence with another word that can have the same POS.""" new_sentence = [ tup[0] if tup[1] in ['DT', 'NNP', '.', ','] or tup[1] not in tagged_words_dict else tagged_words_dict[tup[1]][np.random.randint( len(tagged_words_dict[tup[1]]))] for tup in tagged_rndm_sentence ] new_detokenized_sentence = str( TreebankWordDetokenizer().detokenize(new_sentence)) new_detokenized_sentence = new_detokenized_sentence[0].upper( ) + new_detokenized_sentence[1:] if new_detokenized_sentence[-1].isalnum(): new_detokenized_sentence = new_detokenized_sentence + '.' #print(f"{author}:", tagged_rndm_sentence) #print(f"{author}:", new_detokenized_sentence) #for tag_s, new_s in zip(tagged_rndm_sentence, new_detokenized_sentence.split()): # print(tag_s, new_s) if len(new_sentence) <= 3: return write_sentence() if len(author) + len(new_detokenized_sentence) > 278: return write_sentence() else: return (author, re.sub('[\)]', '', new_detokenized_sentence))
def get_poem(): """ This function should extract hexametric sentences from Gutenberg texts, but it doesn't. Either hexametric sentences are too rare, or the absence of basic function words from CMUdict results in problems with the matching of the whole sentence. """ outtext = [] for corpus in gutenberg.fileids(): text = gutenberg.sents(corpus) for sentence in text: transcription = "" discard = False for word in sentence: if word.lower() in words: transcription += words[word.lower()] elif re.match(one_syllable, word.lower()): # consider this word a "small", unstressed word transcription += "A0A" else: discard = True if re.match(verse, transcription) and not discard: print(sentence, transcription) outtext.append(" ".join(sentence)) return "\n".join(outtext)
from nltk.corpus import gutenberg alice = gutenberg.sents('carroll-alice.txt') def count_word(): result = {} for sentence in alice: for word in sentence: normalized_word = word.lower() if normalized_word.isalpha(): result[normalized_word] = result.setdefault( normalized_word, 0) + 1 return result def count_first_word(): result = {} for sentence in alice: first_word = sentence[0] first_word = first_word.lower() if first_word.isalpha(): result[first_word] = result.setdefault(first_word, 0) + 1 return result def count(): wordf = count_word() firstf = count_first_word() words = sorted(wordf.keys())
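# Hedged usage sketch for the counters above: list the ten most frequent
# alphabetic words in 'Alice in Wonderland' (illustrative only).
word_counts = count_word()
top_words = sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top_words)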
def print_longest(): macbeth_sentences=gutenberg.sents('shakespeare-macbeth.txt') # print macbeth_sentences # print macbeth_sentences[1037] longest_len=max([len(s) for s in macbeth_sentences]) print [s for s in macbeth_sentences if len(s)==longest_len]
def makePowerMatrix(): matrix = [["uniquecount", "sentence length", "avg word length", "digit prop", "capital prop", "quotation", "question", "exclamation", "noun", "adj", "adv", "verb", "foreign", "preposition", "pronoun", "interjection","childW", "historyW","religionW","scienceW", "FILE", "CLASS"]] rowLength = len(matrix) i = 0 j = 0 for path in parse.paths: #parse.paths for file in glob.glob(path): row = [0.0] * 22 wholetext = parsetotext(file) textlist = parse.parse(file) textlistNL = parse.parse(file, False) # not lower case # textliststem = stemparse.parse(file) doclen = len(wholetext) wordcount = len(textlist) question = wholetext.count("?") exclamation = wholetext.count("!") quotations = (wholetext.count("'") + wholetext.count('"')) uniquecount = len(list(set(textlist))) num_words = len(gutenberg.words(file)) num_sents = len(gutenberg.sents(file)) if wordcount != 0: row[0] = uniquecount/(wordcount*1.0) lenmap = map(len, textlist) row[2] = sum(lenmap)/(wordcount*1.0) if num_sents != 0: row[1] = round(num_words/num_sents) if doclen != 0: row[3] = sum(c.isdigit() for c in wholetext)/(doclen*1.0) a = [x.isupper() for x in [y[0] for y in textlistNL]] row[4] = sum(a)/(doclen*1.0) row[5] = quotations / (wordcount * 1.0) row[6] = question / (wordcount * 1.0) row[7] = exclamation / (wordcount * 1.0) wholetext = unicode(wholetext, errors='replace') text = nltk.word_tokenize(wholetext) a = nltk.pos_tag(text) tag_fd = nltk.FreqDist(tag for (word, tag) in a) a = tag_fd.most_common() count = sum([y for (x, y) in a]) if count != 0: row[8] = sum([y for (x, y) in a if x in ["NN", "NNP", "NNPS", "NNS"]])/(count*1.0) #Noun NN NNP NNPS NNS row[9] = sum([y for (x, y) in a if x in ["JJ", "JJR", "JJS"]])/(count*1.0) #Adj JJ JJR JJS row[10] = sum([y for (x, y) in a if x in ["RB", "RBR", "RBS", "WRB"]])/(count*1.0) #Adv RB RBR RBS WRB row[11] = sum([y for (x, y) in a if x in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]])/(count*1.0) #Verb VB VBD VBG VBN VBP VBZ row[12] = sum([y for (x, y) in a if x in ["FW"]])/(count*1.0) #foreign row[13] = sum([y for (x, y) in a if x in ["IN"]])/(count*1.0) #prepo row[14] = sum([y for (x, y) in a if x in ["PRP", "PRP$", "WP", "WP$"]])/(count*1.0) #pronoun row[15] = sum([y for (x, y) in a if x in ["UH"]])/(count*1.0) #interjection if wordcount != 0: row[16] = len([y for y in textlist if y in childW])/(wordcount*1.0) row[17] = len([y for y in textlist if y in historyW])/(wordcount*1.0) row[18] = len([y for y in textlist if y in religionW])/(wordcount*1.0) row[19] = len([y for y in textlist if y in scienceW])/(wordcount*1.0) # row[20] = len([y for y in textliststem if y in childWS])/(wordcount*1.0) # row[21] = len([y for y in textliststem if y in historyWS])/(wordcount*1.0) # row[22] = len([y for y in textliststem if y in religionWS])/(wordcount*1.0) # row[23] = len([y for y in textliststem if y in scienceWS])/(wordcount*1.0) row[-1] = j # This number assigns class row[-2] = re.search('[0-9]+\.txt', file).group() # Extracts file name (Ex: "123.txt") matrix += [row] i += 1 print(path[-20:] + " on iteration " + str(i)) j += 1 return matrix
from nltk.corpus import gutenberg gutenberg.fileids() #print( gutenberg.fileids() ) emma = gutenberg.words('austen-emma.txt') print(len(emma)) ''' This program displays three statistics for each text: average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score). ''' for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab),fileid ) #Returns a List of sentences macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') print( macbeth_sentences ) print( macbeth_sentences[1037] ) #Return the max len of sentences longest_len = max([len(s) for s in macbeth_sentences]) #Save the sentences biggest longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
from nltk.probability import (FreqDist, ConditionalProbDist, ConditionalFreqDist, LidstoneProbDist) from nltk.util import ngrams from nltk.model.api import ModelI from nltk.model.ngram import NgramModel from random import randint from nltk.tokenize import word_tokenize import numpy as np est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) noOfFiles = 3 fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt'] Authors = ['Bryant', 'Carroll', 'Shakespeare'] lenFirstSent = [ len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles) ] C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)] lenC = [len(C[i]) for i in range(noOfFiles)] unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)] bigram = [ NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles) ] trigram = [ NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles) ] def generateText(model, train): pos = []
def _nltk_prep_gutenberg(self, gutenbergF: str): try: gutenberg.sents() except LookupError as _le: nltk.download('gutenberg') return gutenberg
for word in ['Call', 'me', 'Ishmael', '.']:
    print word

# Load the corpus
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")

from nltk.corpus import gutenberg
gutenberg.fileids()
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])

# Web and chat corpora
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.sents(categories=['news', 'editorial', 'reviews'])
news_text = brown.words(categories='news')
import nltk from nltk.corpus import gutenberg from nltk.corpus import brown print(gutenberg.fileids()) print(len(gutenberg.raw('austen-emma.txt'))) macbeth = gutenberg.sents('shakespeare-macbeth.txt') print(macbeth[1:5]) print(brown.categories()) print(brown.words(categories='news')) news_text = brown.words(categories='news') fdist = nltk.FreqDist([w.lower() for w in news_text]) print(fdist) cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre)) print(cfd) genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance'] modals = ['can', 'could', 'may', 'might', 'must', 'will'] cfd.tabulate(conditions=genres, samples=modals)
from nltk.corpus import gutenberg from nltk.util import ngrams from kneser_ney import KneserNeyLM gut_ngrams = (ngram for sent in gutenberg.sents() for ngram in ngrams( sent, 3, pad_left=True, pad_right=True, pad_symbol='<s>')) lm = KneserNeyLM(3, gut_ngrams, end_pad_symbol='<s>') print(lm.score_sent(('This', 'is', 'a', 'sample', 'sentence', '.')))
import nltk from nltk.corpus import gutenberg print nltk.corpus.gutenberg.fileids() print gutenberg.fileids() print gutenberg.words('austen-sense.txt') for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set(w.lower() for w in gutenberg.words(fileid))) print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)
def __init__(self, iterable): assert len(iterable) == 2 tuple.__init__(self, iterable) self.label = iterable[0] self.child = iterable[1] def __new__(cls, *args, **kwargs): assert len(args) == 1 assert len(args[0]) == 2 return tuple.__new__(cls, args[0]) if __name__ == '__main__': import nltk from nltk.corpus import gutenberg import parser def get_deps(s): return parser.cp.parse_trees(s, transform=parser.to_deps) G = FreqGraph() alice_sents = gutenberg.sents(fileids='carroll-alice.txt') for sent in alice_sents: dep = get_deps(' '.join(sent)) try: term = logic.Term(dep.next()) G.clique(term) except: pass #G.ingest(get_deps('Barack is president'))
train_set = []
dev_set = []
test_set = []
for c in brown.categories():
    temp = brown.fileids(c)
    temp_length = len(temp)
    train_set += temp[:int(np.ceil(0.6 * temp_length))]
    dev_set += temp[int(np.ceil(0.6 * temp_length)):int(np.ceil(0.8 * temp_length))]
    test_set += temp[int(np.ceil(0.8 * temp_length)):]
    # test_set += temp[-1:]

# In[3]:

brown_sent_train = brown.sents(train_set) + gb.sents(gb_train_set)
brown_words_train = brown.words(train_set) + gb.words(gb_train_set)
brown_words_train = list(
    filter(
        lambda a: a not in ("``", "''", "--", ".", ",", "!", ";", "(", ")", "?", ":"),
        brown_words_train))
brown_words_train = [x.lower() for x in brown_words_train]
brown_words_train += ['<s>', '</s>'] * len(brown_sent_train)
brown_unigram_dict_train1 = FreqDist(brown_words_train)
brown_unigram_dict_train = copy.deepcopy(brown_unigram_dict_train1)
c = 0
for (k, v) in brown_unigram_dict_train1.items():
    if (v <= 3):
        c += 1
        # Accumulate the counts of rare words under '<unk>' instead of
        # overwriting the entry on every iteration.
        brown_unigram_dict_train['<unk>'] = brown_unigram_dict_train.get('<unk>', 0) + brown_unigram_dict_train.pop(k)
text = remove_stopwords(text) normalized_corpus.append(text) if tokenize: text = tokenize_text(text) normalized_corpus.append(text) return normalized_corpus # NOrmalize the data from nltk.corpus import gutenberg from string import punctuation from keras.preprocessing import text from keras.utils import np_utils from keras.preprocessing import sequence bible = gutenberg.sents('bible-kjv.txt') remove_terms = punctuation + '0123456789' norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible] norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible] norm_bible = filter(None, normalize_corpus(norm_bible)) norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2] print('Total lines:', len(bible)) print('\nSample line:', bible[10]) print('\nProcessed line:', norm_bible[10]) tokenizer = text.Tokenizer() tokenizer.fit_on_texts(norm_bible)
from nltk.corpus import gutenberg for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
# -*- coding: utf-8 -*- """ """ from nltk.corpus import gutenberg from normalization import normalize_corpus import nltk from operator import itemgetter alice = gutenberg.sents(fileids='carroll-alice.txt') alice = [' '.join(ts) for ts in alice] norm_alice = filter(None, normalize_corpus(alice, lemmatize=False)) # print first line print norm_alice[0] def flatten_corpus(corpus): return ' '.join([document.strip() for document in corpus]) def compute_ngrams(sequence, n): return zip(*[sequence[index:] for index in range(n)]) def get_top_ngrams(corpus, ngram_val=1, limit=5): corpus = flatten_corpus(corpus) tokens = nltk.word_tokenize(corpus)
pickle_in = open("gutenbergBrownUnigrams.pickle", "rb") Unigrams = pickle.load(pickle_in) pickle_in = open("gutenbergBrownBigrams.pickle", "rb") Bigrams = pickle.load(pickle_in) pickle_in = open("gutenbergBrownTrigrams.pickle", "rb") Trigrams = pickle.load(pickle_in) print("Pickle in time:" + str(time.time() - start)) # # TEST MODULE - outputs perplexity # # start = time.time() totPerplexity = 0 iterCount = 0 for sent in gutenberg.sents()[78842:]: probOfSent = 0 sent.insert(0, "<s>") sent.append("<\s>") prev_word2 = sent[0] word = sent[1] try: probOfSent += Bigrams[prev_word2 + " " + word] except KeyError: probOfSent += -6 prev_word1 = sent[0] prev_word2 = sent[1] for word in sent[2:]: try: trigram = prev_word1 + " " + prev_word2 + " " + word if trigram in Trigrams:
import nltk from nltk.corpus import gutenberg fileids = gutenberg.fileids() # print 'fileids: ', fileids emma = gutenberg.words('austen-emma.txt') # average characters in a word: raw/words # average word in a sentence: words/sents # lexical diversity - num_words/num_vocab # for fileid in fileids: # num_chars = len(gutenberg.raw(fileid)) # num_words = len(gutenberg.words(fileid)) # num_sents = len(gutenberg.sents(fileid)) # num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) # print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid macbeth_sents = gutenberg.sents('shakespeare-macbeth.txt') longest_len = max([len(s) for s in macbeth_sents]) longest_sent = [s for s in macbeth_sents if len(s) == longest_len] print 'longest_sent: ', longest_sent
def pmi_with_cython(input_corpus): logging.debug(msg='With cython is True') start = time.time() scored_matrix_obj = interface.run_feature_selection( input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True) elapsed_time = time.time() - start print(("elapsed_time with cython:{} [sec]".format(elapsed_time))) from nltk.corpus import gutenberg from nltk.corpus import webtext from nltk.corpus import genesis from nltk.corpus import abc abs_corpus = abc.sents() genesis_corpus = genesis.sents() web_corpus = webtext.sents() gutenberg_corpus = gutenberg.sents() input_corpus = { 'abs': list(abs_corpus), 'genesis': list(genesis_corpus), 'web': list(web_corpus), 'gutenberg': list(gutenberg_corpus) } pmi_with_cython(input_corpus) pmi_with_parallel(input_corpus) #pmi_with_threading(input_corpus)
def analyze(book_name): # This function analyzes the 'book_name' file and prints out its characteristics using nltk package # Extracting characters, words and sentences respectively below characters = g.raw(book_name) words = g.words(book_name) sentences = g.sents(book_name) max_length_word = words[0] max_length_sentence = sentences[0] max_length_sentence_word_count = len(max_length_sentence) vocabulary = list() stem_families = dict() stemmer = PorterStemmer() for word in words: # Checking for the longest word if len(word) > len(max_length_word): max_length_word = word stemmed_word = stemmer.stem(word) # Creating a vocabulary of stemmed words and a dictionary for stem families if stemmed_word not in vocabulary and stemmed_word.isalpha(): vocabulary.append(stemmed_word) stem_families[stemmed_word] = list() stem_families[stemmed_word].append(word.lower()) elif stemmed_word in vocabulary and word.lower( ) not in stem_families[stemmed_word]: stem_families[stemmed_word].append(word.lower()) for sentence in sentences: # Checking for the longest sentence if len(sentence) > len(max_length_sentence): max_length_sentence = sentence max_length_sentence_word_count = len(max_length_sentence) # Converting that largest sentence from a list of words to a cumulative string sentence max_length_sentence_string = " " max_length_sentence_string = max_length_sentence_string.join( max_length_sentence) max_stem_family = list(list(stem_families.items())[0]) for key, value in stem_families.items(): # Checking for the largest stem family if len(value) > len(max_stem_family[1]): max_stem_family[0] = key max_stem_family[1] = list(value) # Printing the characteristics as requested print("Analysis of '%s'" % book_name) print("# chars =", len(characters)) print("# words =", len(words)) print("# sentences =", len(sentences)) print("Longest word = '%s'" % max_length_word) print("Longest sentence = '%s' (%d words)" % (max_length_sentence_string, max_length_sentence_word_count)) print("Vocab size =", len(vocabulary)) print("Largest stem family '%s' : {" % max_stem_family[0], end=" ") for i in range(len(max_stem_family[1])): if i != 0: print(",", end=" ") print("'%s'" % max_stem_family[1][i], end=" ") print("}")
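# Hedged usage sketch for analyze(); assumes the imports the function relies on
# (e.g. `from nltk.corpus import gutenberg as g` and PorterStemmer) appear above.
analyze('carroll-alice.txt')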
import nltk
import numpy as np
from nltk.corpus import gutenberg
import pickle


def save_object(obj, filename):
    # Pickle requires a binary-mode file handle.
    with open(filename, "wb") as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


sents = gutenberg.sents("blake-poems.txt")
table = []
for i in range(20):
    table.append([])

for s in sents[1:]:
    # TODO prevent ?!, => . and don't count . in a sentence to length
    if len(s) > 2 and len(s) < 20:
        tags = nltk.pos_tag(s)
        simpleTags = [(word, nltk.map_tag("en-ptb", "universal", tag)) for word, tag in tags]
        tagsOnly = [t[1] for t in simpleTags]
        # this is to filter out headlines
        if tagsOnly[len(tagsOnly) - 1] == ".":
            wordCount = len(tagsOnly) - tagsOnly.count(".")
            table[wordCount].append(tagsOnly)

save_object(table, "grammar")
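# Hedged sanity check: reload the pickled table written above and inspect one bucket.
import pickle

with open("grammar", "rb") as f:
    grammar_table = pickle.load(f)
print(grammar_table[5][:3])  # first few POS-tag patterns recorded for 5-word sentences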
import nltk
from spacy.en import English
from nltk.corpus import gutenberg

sentences = []
for fileid in gutenberg.fileids():
    txt = gutenberg.sents(fileid)
    for sent_array in txt:
        sentences.append(' '.join(sent_array))
    break  # only process the first corpus file

doc = ' '.join(sentences)
nlp = English()
doc_processed = nlp(doc)
for sent in doc_processed.sents:
    for token in sent:
        if token.is_alpha:
            # token.orth_ is the token text (token.orth is its integer hash)
            print(token.orth_ + "," + token.tag_ + "," + token.head.lemma_)
#!/usr/bin/python
"""Just a testing program for the NLTK library. It is an NLP library for Python.
Some kick-ass library this is. :)

Prerequisites: NLTK library installed, and the additional data for the library
downloaded using its download command. You can use the "Natural Language
Processing with Python" book from O'Reilly for further details.

This program prints some statistics for the corpus (a large compiled collection
of text files).
"""
import nltk
from nltk.corpus import gutenberg

for fid in gutenberg.fileids():
    nchars = len(gutenberg.raw(fid))
    nwords = len(gutenberg.words(fid))
    nsents = len(gutenberg.sents(fid))
    nvocab = len(set(w.lower() for w in gutenberg.words(fid)))
    print "%s %s %s %s" % (int(nchars / nwords), int(nwords / nsents), int(nwords / nvocab), fid)
if a in result: result[a] += freq else: result[a] = freq result = sorted(result.items(), key=operator.itemgetter(1), reverse=True) return result if __name__ == "__main__": # dataset = load_data('mycorpus.txt') # edited_data = edit_data(dataset) # dataset = gutenberg.sents('carroll-alice.txt') # dataset = gutenberg.sents('milton-paradise.txt') # dataset = gutenberg.sents('bible-kjv.txt') nltk.download('gutenberg') dataset = gutenberg.sents(gutenberg.fileids()) #for fileid in gutenberg.fileids(): # dataset += gutenberg.sents(fileid) edited_data = edit_data(dataset) avg = 0 for doc in edited_data: avg += len(doc) avg = avg / len(edited_data) # print('Number of documents: ' + str(len(edited_data))) # print('Average length of document: '+ str(avg)) dictionary = make_dict(edited_data) corpus = [dictionary.doc2bow(text) for text in edited_data] # print('20 most common words of corpus:') freq_data = get_frequency(corpus, dictionary) f = 0 for doc, freq in freq_data:
from __future__ import print_function from nltk.corpus import gutenberg if __name__ == '__main__': # Print all Gutenberg corpus documents print('Gutenberg corpus files:') print(gutenberg.fileids()) # Print a raw corpus print(gutenberg.raw('milton-paradise.txt')) # Print 2 sentences from a corpus print(gutenberg.sents('milton-paradise.txt')[0:2]) # Print 20 words from a corpus print(gutenberg.words('milton-paradise.txt')[0:20])
from nltk.corpus import gutenberg from gensim.models import word2vec from string import punctuation bible_kjv_words = gutenberg.words('bible-kjv.txt') bible_kjv_sents = gutenberg.sents('bible-kjv.txt') discard_punctuation_and_lowercased_sents = [[word.lower() for word in sent if word not in punctuation] for sent in bible_kjv_sents] bible_kjv_word2vec_model = word2vec.Word2Vec(discard_punctuation_and_lowercased_sents, min_count=5, size=200) bible_kjv_word2vec_model.save("bible_word2vec_gensim") bible_kjv_word2vec_model.wv.save_word2vec_format("bible_word2vec_org", "bible_word2vec_vocabulary") print(bible_kjv_word2vec_model.most_similar(["god"])) print(bible_kjv_word2vec_model.most_similar(["apple"]))
from nltk.corpus import gutenberg from nltk.text import Text from chatbot.markovgenerator import markovmodeller if __name__ == "__main__": my_essay = True if my_essay: txt = "" with open('res/ans.txt') as text: for line in text: txt += line model = markovmodeller.build_markov_model_from_string(txt) else: list_sentences = Text(gutenberg.sents('austen-sense.txt')) model = markovmodeller.build_markov_model_from_list_of_sentences( list_sentences) while (True): walk_string = markovmodeller.get_walk(model) print(walk_string) x = input("--- type X to stop") if x == "X": break
    for i in range(len(words_te) - 2):
        x = (words_te[i], words_te[i+1], words_te[i+2])
        if Interpolated_Kneser_Ney_dict.get(x, "empty") == "empty":
            if (x[0], x[1]) not in bgcounter:
                Interpolated_Kneser_Ney_dict[x] = findPKn_bigram((x[1], x[2]))
            else:
                Interpolated_Kneser_Ney_dict[x] = findPKn_trigram(x, discount_final)
        perp = perp * ((1 / Interpolated_Kneser_Ney_dict[x]) ** (1 / N))
    return perp


# In[30]:

text_gutenberg = list(gutenberg.sents())
text_brown = list(brown.sents())
text_gutenberg_size = len(text_gutenberg)
text_brown_size = len(text_brown)

for i in range(text_gutenberg_size):
    text_gutenberg[i].insert(0, "<s>")
    text_gutenberg[i].insert(len(text_gutenberg[i]), '<e>')
    text_gutenberg[i].insert(len(text_gutenberg[i]), '<e>')

for i in range(text_brown_size):
    text_brown[i].insert(0, "<s>")
    text_brown[i].insert(len(text_brown[i]), '<e>')
    text_brown[i].insert(len(text_brown[i]), '<e>')

text_gutenberg_tr, text_gutenberg_te = train_test_split(text_gutenberg, test_size=.20, random_state=4)
text_brown_tr, text_brown_te = train_test_split(text_brown, test_size=.20, random_state=4)
return ngram[-1] def highest_order_probs(self): return self.lm[0] def generate_sentence(self, min_length=4): """ Generate a sentence using the probabilities in the language model min_length: int, the minimum number of words in the sentence. """ sent = [] probs = self.highest_order_probs() while len(sent) < min_length + self.highest_order: sent = [self.start_pad_symbol]*(self.highest_order-1) sent.append(self.generate_next_word(sent,probs)) while sent[-1] != self.end_pad_symbol: sent.append(self.generate_next_word(sent,probs)) sent = " ".join(sent[(self.highest_order-1):-1]) return sent ## how to test it from nltk.corpus import gutenberg from nltk.util import ngrams from ModifiedKneserNeyLM import ModifiedKneserNeyLM gut_ngrams = (ngram for sent in gutenberg.sents() for ngram in ngrams(sent, 3, pad_left=True, pad_right=True, pad_symbol='<s>')) lm = ModifiedKneserNeyLM(3, gut_ngrams, end_pad_symbol='<s>') print(lm.score_sent(('This','is','a','sample','sentence','.')))
if (len(text) != 10): return False, s else: return True, s if __name__ == '__main__': time.clock() print() brown_corpus = list(brown.sents(brown.fileids())) for i in range(len(brown_corpus)): brown_corpus[i] = list(map(lambda x: x.lower(), brown_corpus[i])) gutenberg_corpus = list(gutenberg.sents(gutenberg.fileids())) for i in range(len(gutenberg_corpus)): gutenberg_corpus[i] = list( map(lambda x: x.lower(), gutenberg_corpus[i])) combined_corpus = brown_corpus + gutenberg_corpus unigram_list, bigram_list = training(combined_corpus) i = 0 while (i < 1): bool, s = generate_trigram_token(bigram_list) if (bool): i += 1 print(s) print() print('Total time taken', str(time.clock()))
# Module 5: Word Embedding # Gutenberg Word2Vec # Author: Dr. Alfred from gensim.models import Word2Vec from nltk.corpus import gutenberg embedding = Word2Vec(gutenberg.sents(), min_count=1, window=5, size=32) print(embedding['man']) print(embedding.most_similar('man', topn=5)) print(embedding.most_similar('woman', topn=5)) print(embedding.most_similar(positive=['woman', 'king'], negative=['man']))
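# The snippet above uses the older gensim (<= 3.x) API. Under gensim 4.x the same
# steps look roughly like this (a sketch, assuming gensim >= 4.0 is installed):
from gensim.models import Word2Vec
from nltk.corpus import gutenberg

embedding = Word2Vec(gutenberg.sents(), min_count=1, window=5, vector_size=32)
print(embedding.wv['man'])                      # 32-dimensional vector for 'man'
print(embedding.wv.most_similar('man', topn=5))
print(embedding.wv.most_similar(positive=['woman', 'king'], negative=['man']))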
def fun03(): """fun03""" macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') print macbeth_sentences[1037] longest_len = max([len(s) for s in macbeth_sentences]) print [s for s in macbeth_sentences if len(s) == longest_len]
for word in wordlist: try: for thing in wikipedia.search(word): #print("SEARCH TERM: "+thing) #print(wikipedia.page(thing)) #print(wikipedia.page(thing).content) pages += wikipedia.page(thing).content except wikipedia.DisambiguationError as e: s = random.choice(e.options) get_wiki(s) pass except: pass b = brown.sents() sents = tokenizer.tokenize(pages) sense = gutenberg.sents('austen-sense.txt') emma = gutenberg.sents('austen-emma.txt') persuasion = gutenberg.sents('austen-persuasion.txt') bible = genesis.sents('english-kjv.txt') blake = gutenberg.sents('blake-poems.txt') bryant = gutenberg.sents('bryant-stories.txt') burgess = gutenberg.sents('burgess-busterbrown.txt') carroll = gutenberg.sents('carroll-alice.txt') ch_ball = gutenberg.sents('chesterton-ball.txt') ch_brown = gutenberg.sents('chesterton-brown.txt') ch_thurs = gutenberg.sents('chesterton-thursday.txt') edge = gutenberg.sents('edgeworth-parents.txt') mel = gutenberg.sents('melville-moby_dick.txt') mil = gutenberg.sents('milton-paradise.txt') caesar = gutenberg.sents('shakespeare-caesar.txt') hamlet = gutenberg.sents('shakespeare-hamlet.txt')
nltk.corpus.gutenberg.fileids() emma = nltk.corpus.gutenberg.words('austen-emma.txt') len(emma) emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt')) emma.concordance('surprize') #another way to do this from nltk.corpus import gutenberg gutenberg.fileids() emma = gutenberg.words('austen-emma.txt') for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print (int(num_chars/num_words), int(num_words/num_sents)) #avg word & sentence length and the diversity of words macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') macbeth_sentences #load sentences of Macbeth macbeth_sentences[1037] longest_len = max([len(s) for s in macbeth_sentences]) [s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence from nltk.corpus import webtext for fileid in webtext.fileids(): print (fileid, webtext.raw(fileid)[:65], '...') from nltk.corpus import nps_chat chatroom = nps_chat.posts('10-19-20s_706posts.xml')
# pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
# # library
# nltk.download('gutenberg')
# # punctuation and tokenizer
# nltk.download('punkt')

from nltk.corpus import gutenberg

print(gutenberg.fileids())

gberg_sents = gutenberg.sents(fileids=[
    'bible-kjv.txt', 'austen-emma.txt', 'austen-persuasion.txt',
    'austen-sense.txt', 'carroll-alice.txt'
])

# SENTENCE COUNT
print(len(gberg_sents))

model = Word2Vec(sentences=gberg_sents,
                 size=64,
                 sg=1,
                 window=10,
                 min_count=5,
from nltk.corpus import gutenberg
from pynput.keyboard import Key, KeyCode, Listener
import random


def on_release(key):
    print(key)
    # Comparing against the raw string 'q' never matches a pynput key object,
    # so compare against the KeyCode for 'q' instead.
    if key == KeyCode.from_char('q'):
        exit()
    elif key == Key.enter:
        print('next')
        return False  # stop the listener so the loop prints the next sentence


fileCount = len(gutenberg.fileids())
fileName = gutenberg.fileids()
fileSents = gutenberg.sents(random.choice(fileName))
print(' '.join(fileSents[0]), "\n")

while 1:
    sentence = random.randrange(0, len(fileSents))
    sentence = ' '.join(fileSents[sentence])
    print(sentence)
    with Listener(on_release=on_release) as listener:
        listener.join()