from nltk.corpus import webtext, reuters
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


def makeModel():
    # sentences = webtext.raw() + brown.raw() + reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences.
    try:
        # Use the default NLTK tokenizers.
        from nltk import word_tokenize, sent_tokenize
        # Test whether they work; on some machines they fail because of setup issues.
        word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
    except Exception:
        # Fall back to a naive regex sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # The toktok tokenizer requires no extra data files.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize
    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]
    # Prepare padded everygrams for an n-gram model (n = 5).
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    model = MLE(n)
    # Train the 5-gram MLE model set up above.
    model.fit(train_data, padded_sents)
    # print(model.vocab)
    return model
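# A minimal usage sketch for the model returned above (added for illustration;
# the sample length, random seed, and detokenizer choice are assumptions, not
# part of the original snippet).
from nltk.tokenize.treebank import TreebankWordDetokenizer

model = makeModel()
detokenize = TreebankWordDetokenizer().detokenize

# Sample 20 tokens from the trained 5-gram MLE model and drop padding symbols.
generated = [tok for tok in model.generate(20, random_seed=42)
             if tok not in ('<s>', '</s>')]
print(detokenize(generated))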
def calculate_normal_word_freq():
    normal_word_freq = Counter()
    # Count word frequencies over the 'overheard' corpus, dropping the
    # speaker label before each ':'.
    for text in webtext.raw("overheard.txt").split("\n"):
        normal_word_freq.update(
            WordCloud.parse_tweet(text.split(":", maxsplit=1)[-1]))
    return normal_word_freq
def tokenize_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    print('[singles] Line:8 - {}'.format(singles_no_8))
    print('\n-----\n'.join(sent_tokenize(singles_no_8)))
    print('Word tokenizer')
    for i, sent in enumerate(sent_tokenize(singles_no_8)):
        print('{}: {}'.format(i, word_tokenize(sent)))
def stopwords_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    single_no8_tokenized_lowered = list(
        map(str.lower, word_tokenize(singles_no_8)))
    # Build the English stopword set once and extend it with punctuation.
    stopwords_en = set(stopwords.words('english'))
    stopwords_en_withpunct = stopwords_en.union(set(punctuation))
    print([
        word for word in single_no8_tokenized_lowered
        if word not in stopwords_en_withpunct
    ])
def tagMessage(self, print_tag=0):
    # Train a Punkt sentence tokenizer on the 'overheard' web text, then
    # tokenize the message into sentences and POS-tag each one.
    text = webtext.raw('overheard.txt')
    temp = PunktSentenceTokenizer(text)
    message = temp.tokenize(self.message)
    for w in message:
        words = nltk.word_tokenize(w)
        tagged = nltk.pos_tag(words)
    # Note: only the tags of the last sentence are kept and returned.
    if print_tag:
        print("Message is: ", self.message)
        print("Tagged message is: ", tagged)
    return tagged
def tokenize(corpus, fileID):
    '''
    Tokenizes the given corpus file, casting all words to lower case and
    stripping out punctuation marks, spaces, and words not made of one or
    more alphanumeric characters.

    Parameters
    ----------
    corpus: An NLTK corpus
    fileID: A string

    Returns
    -------
    words: a list of strings
    '''
    # Use a regex to replace punctuation marks with spaces, then tokenize.
    pattern = re.compile(r'[^\w\s]')
    text = corpus.raw(fileID)
    words = [word.lower()
             for word in nltk.word_tokenize(re.sub(pattern, ' ', text))]
    return words
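# A small usage sketch for tokenize() above; the corpus and file chosen here
# are illustrative, not from the original snippet.
import re
import nltk
from nltk.corpus import webtext

words = tokenize(webtext, 'grail.txt')
fdist = nltk.FreqDist(words)
print(fdist.most_common(10))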
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat
    # List the file ids of both corpora.
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]
    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))
    lexical_diversity = lexical_div(uniqs, pirates_words)
    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')
    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
import nltk
from nltk.corpus import gutenberg

gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

# Loop over the texts to collect per-file statistics.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars/num_words), int(num_words/num_sents),
          int(num_words/num_vocab), fileid)

# webtext in nltk.corpus
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:2])

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

# Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='editorial')
brown.words(fileids=['cp12'])
brown.sents(categories=['news', 'editorial'])
edi_text = brown.words(categories='fiction')
fdist = nltk.FreqDist([w.lower() for w in edi_text])
# Note: these are wh-question words, not modal verbs.
modals = ['what', 'who', 'where', 'when', 'why']
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    # Average word length and average sentence length per file.
    print(int(num_chars/num_words), int(num_words/num_sents))

# Load the sentences of Macbeth.
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences
macbeth_sentences[1037]
# Find the longest sentence.
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len]

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])

from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
def fun04():
    """fun04: print the first 50 characters of each webtext file."""
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:50])
# Note: naming this function "webtext" shadows the imported nltk webtext
# corpus if both live in the same module; rename one of them in real code.
def webtext():
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

# emma = gutenberg.words('austen-emma.txt')
# print(len(set(w.lower() for w in emma)))

for file in webtext.fileids():
    print(file, webtext.raw(file)[:65])

chatroom = nps_chat.posts()
nSents = len(gutenberg.sents(fileid))
nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
print(int(nChars/nWords), int(nWords/nSents), int(nWords/nVocab), fileid)

macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')
longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]

from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
webtext.raw('pirates.txt').lower().count('jack')
pirates = nltk.Text(webtext.words('pirates.txt'))

from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.words(fileids=['cg22', 'ca16'])  # Concatenates the word lists of the two files.

from nltk.corpus import brown
newsText = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
def __init__(self):
    self.number_id = 28
    self.source_id = "webtext"
    self.titles = [name for name in webtext.fileids()]
    self.data = [webtext.raw(name) for name in self.titles]
nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
print(int(nChars / nWords), int(nWords / nSents), int(nWords / nVocab), fileid)

macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')
longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]

from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
webtext.raw('pirates.txt').lower().count('jack')
pirates = nltk.Text(webtext.words('pirates.txt'))

from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.words(fileids=['cg22', 'ca16'])  # Concatenates the word lists of the two files.

from nltk.corpus import brown
newsText = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English-language text, not as easy for some other languages
'''
import nltk

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# Wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# Tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# Tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# Only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown

# for fileid in webtext.fileids():
#     print fileid, webtext.raw(fileid)[:65]
# for fileId in nps_chat.fileids():
#     print fileId

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print('pirates_char:', pirates_char,
      'pirates_words:', pirates_words,
      'pirates_sents:', pirates_sents,
      'avg char per word:', int(pirates_char / pirates_words),
      'avg words per sentence:', int(pirates_words / pirates_sents))

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))


def lexical_div(un, total):
    return total / un


print('lexical diversity:', lexical_div(uniqs, pirates_words))

# brown_categories = brown.categories()
# for genre in brown_categories:
#     print genre

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
)
# output: ['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

#### WORD TOKENIZE ######
sent = 'Hello World.'
print(word_tokenize(sent))
# output: ['Hello', 'World', '.']

#### ALTERNATIVE WORD TOKENIZER ######
para_1 = "Can't is a contraction."
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(para_1))
# output: ['Can', "'", 't', 'is', 'a', 'contraction', '.']

#### REGULAR EXPRESSION TOKENIZER ######
regex = "Can't is a contraction."
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize(regex))
# output: ["Can't", 'is', 'a', 'contraction']

#### TRAINING A SENTENCE TOKENIZER ######
text = webtext.raw('overheard.txt')                # Read the example text
sent_tokenizer = PunktSentenceTokenizer(text)      # Train a tokenizer on that text
sents_tokenizer_1 = sent_tokenizer.tokenize(text)  # Use the newly trained tokenizer
sents_tokenizer_2 = sent_tokenize(text)            # Default (pre-trained) tokenizer

#### FILTERING STOPWORDS ######
english_stops = set(stopwords.words('english'))    # Load the English stopword list
words = ["Can't", 'is', 'a', 'contraction']
print([word for word in words if word not in english_stops])
# output: ["Can't", 'contraction']
from functools import reduce
import operator
import string
from nltk.corpus import gutenberg

num_of_words_to_plot = 20
num_of_words_compare = 50
file_path = "/home/helena/Documents/NLP/data/study_in_scarlet.txt"
moby_file_name = 'melville-moby_dick.txt'

# Read the file
file = open(file_path, 'r')
raw_text = file.read()

# Word and sentence tokenization (use the text read above).
tokenized_sentences = sent_tokenize(raw_text)
# tokenized_words = reduce(operator.concat, [word_tokenize(s) for s in tokenized_sentences])
tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english') + list(string.punctuation)
raw_tokens = tokenizer.tokenize(raw_text.lower())
tokens = [i for i in raw_tokens if i not in stop]

# Convert to an nltk Text
text = Text(tokens)

# Frequency distribution
fdist = FreqDist(text)
fdist.plot(num_of_words_to_plot, cumulative=False)
scarlet_commons = [word for word, counts in fdist.most_common(num_of_words_compare)]
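# The snippet above imports gutenberg and defines moby_file_name but never uses
# them; a plausible continuation is sketched below, under the assumption that
# the goal was to compare the most common words of the two novels. This is an
# illustrative guess, not code from the original source.
moby_raw = gutenberg.raw(moby_file_name).lower()
moby_tokens = [t for t in tokenizer.tokenize(moby_raw) if t not in stop]
moby_fdist = FreqDist(moby_tokens)
moby_commons = [w for w, _ in moby_fdist.most_common(num_of_words_compare)]

# Words that rank among the most frequent in both texts.
shared_commons = set(scarlet_commons) & set(moby_commons)
print(sorted(shared_commons))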
        'freq_tokens_top15': [],
        'freq_bigrams': None,
        'freq_bigrams_top15': [],
        'freq_quadrigrams_life': []
    }
}

# Build the stopword list and add custom punctuation/token entries.
stopwords = stopwords.words('english') + [
    "[", "]", ".", ",", "?", "*", ":", "...", "!", "'", "'s", "#", "(", ")",
    "'m", "-", "'ve", "ft.", "n't", "y.o", "&", "..", "n/s", "s/d", "n/d",
    "s/s", "s/e", "''"
]

for file in data:
    text = webtext.raw(file)

    # Generate and filter the tokens of each file.
    data[file]['tokens'] = tokenize.word_tokenize(text)
    data[file]['tokens'] = [
        t.lower() for t in data[file]['tokens'] if t.lower() not in stopwords
    ]

    # Build the token frequency distribution.
    data[file]['freq_tokens'] = nltk.FreqDist(data[file]['tokens'])

    # Record the 15 most frequent tokens.
    top15 = data[file]['freq_tokens'].most_common(15)
    data[file]['freq_tokens_top15'] = top15

    # Build the bigram frequency data.
        # Fragment: tail of the word2vec similarity method (used by vec_sim below).
        word_sim = {}
        for i in range(self.unique_word):
            vet_w2 = self.w1[i]
            # Cosine similarity: dot product divided by the product of the norms.
            theta_sum = np.dot(vet_w1, vet_w2)
            theta_den = np.linalg.norm(vet_w1) * np.linalg.norm(vet_w2)
            theta = theta_sum / theta_den
            word = self.index_word[i]
            word_sim[word] = theta
        sort_word = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for word, sim in sort_word[:top_n]:
            print(word, sim)


fx = webtext.raw(webtext.fileids()[0])
corpus = fx[:1000]
print(corpus)

settings = {"train": {"window": 2, "epoch": 3000, "lr": 0.01}}
w2 = word2vec(settings)
pre_pr = w2.pre_process(corpus, ispara=False)
# print(corpus)
training_Data = w2.gen_training_data(pre_pr)
w2.train(training_Data)

t_word = "phoenix"
print(w2.word_vec(t_word))
w2.vec_sim(t_word, 5)
# print(training_Data.size * training_Data.itemsize)
from nltk.corpus import webtext

for file_id in webtext.fileids():
    print(file_id)
    print(webtext.raw(file_id)[:100])
    print()
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

print('WEBTEXT___')
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:20])

print('NPS_CHAT___')
for post in nps_chat.posts():
    print(post)
from nltk.corpus import webtext
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

article = """Girl: But you already have a Big Mac... Hobo: Oh, this is all theatrical.
Girl: Hola amigo... Hobo: his is all theatrical. 我说: "U.S.A 你好啊". U.S.A is the
abbreviation of United States. To use statistical parameters such as mean and standard
deviation reliably, you need to have a good estimator for them. The maximum likelihood
estimates (MLEs) provide one such estimator. However, an MLE might be biased, which
means that its expected value of the parameter might not equal the parameter being
estimated."""

sentences = sent_tokenize(article)
for sentence in sentences:
    tokens = word_tokenize(sentence)
    # print(sentence)

text = webtext.raw('overheard.txt')
print(text)

# Train a Punkt sentence tokenizer on the 'overheard' text and compare it with
# the default sent_tokenize on both the corpus text and the article above.
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents2 = sent_tokenize(text)
sents1_article = sent_tokenizer.tokenize(article)
sents2_article = sent_tokenize(article)

print(sents1[0])
print(sents2[0])
print()
print(sents1[677])
print(sents2[677])
print()
args = parser.parse_args()

if args.top_n_remove < 0:
    parser.error("--top_n_remove Must Be 0 Or Greater")
else:
    n = args.top_n_remove
    print("Number Of Most Frequent Words To Remove: " + str(args.top_n_remove))

# Download WebText Corpus If Not Already Downloaded
nltk.download("webtext")
from nltk.corpus import webtext

# Break Raw Data Into Individual Reviews
wine_reviews_raw = webtext.raw("wine.txt").split("\n")

# Used To Remove Non-Text Data
translator = str.maketrans('', '', string.punctuation)

################
# Bag Of Words #
################

cleaned_review_data = []
review_labels = []
good_review_freq = nltk.FreqDist()
bad_review_freq = nltk.FreqDist()

for review in wine_reviews_raw:
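# Hypothetical continuation of the truncated loop above. The labeling rule
# (reviews containing two or more '*' characters count as "good") is an
# assumption for illustration; the original criterion is not shown in the snippet.
for review in wine_reviews_raw:
    if not review.strip():
        continue  # skip blank lines
    label = "good" if review.count("*") >= 2 else "bad"
    # Strip punctuation and lowercase before tokenizing.
    tokens = nltk.word_tokenize(review.translate(translator).lower())
    cleaned_review_data.append(tokens)
    review_labels.append(label)
    target = good_review_freq if label == "good" else bad_review_freq
    for token in tokens:
        target[token] += 1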
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize
from nltk.corpus import webtext

text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
print(sents1[678])

sents2 = sent_tokenize(text)
print(sents2[678])

# Re-read a local copy of the file with an explicit encoding and retrain.
with open(r'D:\Python\Data\overheard.txt', encoding='ISO-8859-2') as f:
    text2 = f.read()
sent_tokenizer3 = PunktSentenceTokenizer(text2)
sents3 = sent_tokenizer3.tokenize(text2)
print(sents3[0])
"vpon", "our", "Battlements", ]] longest_sentence = " ".join(longest_sentence[0]) print(longest_sentence) # Doubtfull it stood , As two spent Swimmers , that doe cling together , And choake their Art : The mercilesse Macdonwald ( Worthie to be a Rebell , for to that The multiplying Villanies of Nature Doe swarme vpon him ) from the Westerne Isles Of Kernes and Gallowgrosses is supply ' d , And Fortune on his damned Quarry smiling , Shew ' d like a Rebells W***e : but all ' s too weake : For braue Macbeth ( well hee deserues that Name ) Disdayning Fortune , with his brandisht Steele , Which smoak ' d with bloody execution ( Like Valours Minion ) caru ' d out his passage , Till hee fac ' d the Slaue : Which neu ' r shooke hands , nor bad farwell to him , Till he vnseam ' d him from the Naue toth ' Chops , And fix ' d his Head vpon our Battlements # 1.2 Web and Chat Text from nltk.corpus import webtext for fileid in webtext.fileids(): print(fileid, webtext.raw(fileid)[:65], "...") """ output firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ... grail.txt SCENE 1: [wind] [clop clop clop] KING ARTHUR: Whoa there! [clop ... overheard.txt White guy: So, do you have any plans for this evening? Asian girl ... pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ... singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ... wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ... """ from nltk.corpus import nps_chat chatroom = nps_chat.posts("10-19-20s_706posts.xml")
print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)

#%%
sentences = gutenberg.sents('shakespeare-macbeth.txt')
sentences
sentences[1037]
# The longest sentence
long = max([len(s) for s in sentences])
[s for s in sentences if len(s) == long]

#%%
# Web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:60])

#%%
# Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
news_words = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_words])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, fdist[m])

#%%
emma = gutenberg.words('austen-emma.txt')

#%%
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents),
          round(num_words/num_vocab), fileid)

#%%
# Web and Chat Text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:100], '...')

#%%
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    # print(m + ':', fdist[m])
    print(m, ':', fdist[m], ' ', end='')

#%%
# Conditional frequencies
cdf = ConditionalFreqDist(
    (genre, word)
import nltk
nltk.download()

'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English-language text, not as easy for some other languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# Wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# Tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# Tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# Only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
import nltk

print(nltk.corpus.gutenberg.fileids())
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
# concordance() prints its matches itself and returns None.
emma.concordance("surprize")

from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
# print(brown.words())
# print(brown.fileids())
# print(brown.raw('cr08').strip()[:1000])

from nltk.corpus import webtext
import re

# print(webtext.fileids())

# Each line is one advertisement.
# for i, line in enumerate(webtext.raw('singles.txt').split('\n')):
#     if i > 10:  # Let's take a look at the first 10 ads.
#         break
#     print(str(i) + ':\t' + line)

import pandas as pd

single_no8 = webtext.raw('singles.txt').split('\n')[8]
# print(single_no8)

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# Sentence tokenization
from nltk import sent_tokenize, word_tokenize

# print(sent_tokenize(single_no8))

# for sent in sent_tokenize(single_no8):
#     print(word_tokenize(sent))

# It's a little inefficient to loop through each word after word-tokenizing,
# but sometimes it helps to get better tokens.
# for sent in sent_tokenize(single_no8):
#     print([word.lower() for word in word_tokenize(sent)])
def fun3():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
fileid))

macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt")
print("macbeth_sentences= ", macbeth_sentences)
print("macbeth_sentences[1037]= ", macbeth_sentences[1037])
longest_len = max([len(s) for s in macbeth_sentences])
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print("longest_sent= ", longest_sent)

# 2.1.2. Web text and chat text
# Web text
from nltk.corpus import webtext
for field in webtext.fileids():
    print(field, webtext.raw(field)[:65], '...')

# Chat text
from nltk.corpus import nps_chat
for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: used to study systematic differences between genres (also called stylistics)
from nltk.corpus import brown
show_subtitle("Distinguish texts by category")
print("brown.categories() =", brown.categories())
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

# IM chat sessions
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
from nltk.corpus import webtext
from nltk.corpus import nps_chat

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

# Find every character offset at which 'a' occurs in the firefox text.
text = webtext.raw('firefox.txt')
print([i for i in range(len(text)) if text.startswith('a', i)])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
text2 = nps_chat.raw('11-09-teens_706posts.xml')
# Return the length of the longest sentence.
longest_len = max([len(s) for s in macbeth_sentences])
# Save the longest sentence(s).
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]

# ********************************************************************************************************
# Web and Chat Text
# ********************************************************************************************************
'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

# ********************************************************************************************************
# Brown Corpus
# ********************************************************************************************************
from nltk.corpus import brown
# Display the categories that it has.
import nltk
from nltk.corpus import webtext

# Remember, this is for funz, but later make a wine recommender - either pick a
# new wine or learn what you like (I like these wines -> you have this taste preference).
# Generate a mix of presidential addresses and genesis - corpus.state_union && corpus.genesis

srcLen = len(webtext.raw('singles.txt'))
print('Length of sezzy shingles waiting for you: {0}'.format(srcLen))
def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
macbeth_sentences[1116]

# In[64]:
longest_len = max(len(s) for s in macbeth_sentences)
[s for s in macbeth_sentences if len(s) == longest_len]

# In[65]:
from nltk.corpus import webtext
for f in webtext.fileids():
    print(f, webtext.raw(f)[:65], "....")

# In[66]:
from nltk.corpus import brown
brown.categories()

# In[67]:
gov_text = brown.words(categories='government')
def wine_reviews() -> Iterable[str]:
    """Yield each non-empty line of the wine corpus as one review (newline included)."""
    reviews = webtext.raw('wine.txt')
    for match in re.finditer(r'(.*)\n', reviews):
        if match.group(0).strip():
            yield match.group(0)
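# A quick usage sketch for the wine_reviews() generator above (assumes the
# imports the function relies on -- re, typing.Iterable, nltk.corpus.webtext --
# are present in the original module).
reviews = list(wine_reviews())
print(len(reviews), "non-empty reviews")
print(reviews[0].strip())  # first review, trailing newline removed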
import nltk
import pprint

print("****** gutenberg")
from nltk.corpus import gutenberg
print(gutenberg.fileids())
print("raw: ", len(gutenberg.raw()))
print("words: ", len(gutenberg.words()))
print("sents: ", len(gutenberg.sents()))

print("****** webtext")
from nltk.corpus import webtext
print(len(webtext.raw('firefox.txt')))

print("****** nps_chat")
from nltk.corpus import nps_chat
print(nps_chat.fileids())
cr = nps_chat.posts('10-19-20s_706posts.xml')
print(cr)

print("****** brown")
from nltk.corpus import brown
nt = brown.words(categories='news')
print(nt)

from nltk.corpus import reuters
from nltk.corpus import inaugural
print([w for w in nltk.corpus.udhr.fileids() if 'heb' in w.lower()])
print(nltk.corpus.brown.readme())