def __init__(self):
    """Collect Machado corpus texts grouped by genre and set up display markers.

    NOTE(review): relies on module-level `util`, `machado` and `GEN_CNT`,
    and on `self.marker()` defined elsewhere in the enclosing class --
    none of them visible here, confirm against the full file.
    """
    # Genre (category) prefixes used to group Machado corpus file ids.
    classes = "contos critica cronica miscelanea poesia romance teatro".split()
    self.token = self.punct = self.clauses = self.patterns = ''
    # Punctuation treated as delimiters; "CC" and "--" are extra marker tokens.
    self.punctuation = list(",.:;!?") + ["CC", "--"]
    self.punctuate = list(",.:;!?-")
    # _text = self.text_setup(text5)
    # self.split_clauses(_text)
    # self.plot_clause('a')
    self.classes = util.Mont().mont_symbol_pt()
    # Render punctuation marks in bold yellow via ANSI escape codes.
    self.classes.update({pt: "\033[1;33m{}\033[1;0m".format(pt)
                         for pt in self.punctuate})
    self.marker()
    # Map each genre to the word lists of every corpus file whose id
    # contains that genre prefix.
    self.gens = {gen: [machado.words(txid) for txid in machado.fileids()
                       if gen in txid]
                 for gen in classes}
    self.texts = []
    self.legends = []
    self._patt_data = []
    self._patt_labels = []
    # Keep the first GEN_CNT texts of each genre, one legend entry per text.
    for gen in classes:
        self.legends.extend([gen]*GEN_CNT)
        self.texts.extend([tx for tx in self.gens[gen]][:GEN_CNT])
    # self.texts = [machado.words(conto) for conto in machado.fileids() if "contos" in conto][:20]
    # Debug print: a small slice of tokens from each selected text.
    for txt in self.texts:
        print(txt[1000:1004])
# Author: Steven Bird <*****@*****.**> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT from nltk.corpus import machado, mac_morpho, floresta, genesis from nltk.text import Text from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell print "*** Introductory Examples for the NLTK Book ***" print "Loading ptext1, ... and psent1, ..." print "Type the name of the text or sentence to view it." print "Type: 'texts()' or 'sents()' to list the materials." ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") print "ptext1:", ptext1.name.decode('latin-1') ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") print "ptext2:", ptext2.name.decode('latin-1') ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") print "ptext3:", ptext3.name.decode('latin-1') ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)") print "ptext4:", ptext4.name.decode('latin-1') def texts(): print "ptext1:", ptext1.name.decode('latin-1') print "ptext2:", ptext2.name.decode('latin-1') print "ptext3:", ptext3.name.decode('latin-1')
# Tail of the corpus-selection table: display label -> lazy loader for that
# corpus's word list (the dict's opening lines are outside this view).
    'English: NPS Chat Corpus': lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus': lambda: treebank.words(),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus': lambda: alpino.words(),
    'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words()
}

class CollocationsView:
    # Background colour used by the Tk widgets.
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        """Build the collocations GUI: model, Tk root, menu bar and widgets."""
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
# Exploration of the Machado corpus (NLTK exercise 2).
from nltk.corpus import machado, stopwords
import nltk

# nltk.download('machado')
print(machado.readme())

# 2.a -- categories available in the corpus
print(machado.categories())

# 2.b -- every file id
print(machado.fileids())

# 2.c -- word list of one novel
arq = 'romance/marm05.txt'
palavras = machado.words([arq])
print(palavras)

# 2.d -- frequency of two selected words
fdist = nltk.FreqDist(palavras)
for p in ('olhos', 'estado'):
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# 2.e -- total token count
print(len(palavras))

# 2.f -- number of distinct words
print(len(fdist.keys()))

# 2.g -- the vocabulary itself
print(fdist.keys())

# 2.h
# ATIVIDADE: EXERCITANDO 3 - PARTE 02
# AUTOR: Paulo Gamero
"""POS-tag two Machado works (Dom Casmurro, O Alienista) after stopword
removal, and save the tagged Dom Casmurro tokens to a text file."""
import os
import pandas as pd
import nltk
from nltk.corpus import machado, stopwords

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# LETRA A: tag each document's words with their part of speech and save the
# POS-tagged corpus for later use, keeping the source-document information.
arq_casmurro = 'romance/marm08.txt'
arq_alienista = 'contos/macn003.txt'
casmurro = machado.words([arq_casmurro])
alienista = machado.words([arq_alienista])

stopword = stopwords.words('portuguese')
# Extra punctuation/control tokens not covered by the stopword list.
minha_stop_w = [',','[',':', '\'','.','...','?',']', '!','/','-',';','\x97','(',')','\x94.','\x93','\x92',]

# PERF FIX: the original tested membership against two *lists* for every
# token (O(n) per test); a single set gives O(1) lookups with identical output.
descartar = set(stopword) | set(minha_stop_w)
casmurro_clear = [p for p in casmurro if p not in descartar]
# Slice appears to trim front/back matter of the story -- TODO confirm bounds.
alienista_clear = [p for p in alienista if p not in descartar][198:10232]

# POS tagging. NOTE(review): nltk.pos_tag defaults to an English tagger, so
# tags on Portuguese text are unreliable -- confirm this is intended.
pos_tags_casmurro = nltk.pos_tag(casmurro_clear)
pos_tags_alienista = nltk.pos_tag(alienista_clear)

# BUG FIX: the path was built as os.getcwd() + '\Paulo_...', which only works
# on Windows and relies on '\P' not being an escape; os.path.join is portable.
with open(os.path.join(os.getcwd(), 'Paulo_POS_Dom-Casmurro.txt'), 'w', encoding='utf8') as txt:
    for p in pos_tags_casmurro:
        txt.write(str(p) + '\n')
# Tail of the corpus-selection table: display label -> lazy loader for that
# corpus's word list (the dict's opening lines are outside this view).
    'English: NPS Chat Corpus': lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus': lambda: treebank.words(),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus': lambda: alpino.words(),
    'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words()
}

class CollocationsView:
    # Background colour used by the Tk widgets.
    _BACKGROUND_COLOUR='#FFF' #white

    def __init__(self):
        """Build the collocations GUI and load the default corpus."""
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Kick off loading of the default corpus immediately.
        self.load_corpus(self.model.DEFAULT_CORPUS)
# Author: Steven Bird <*****@*****.**> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT from nltk.corpus import machado, mac_morpho, floresta, genesis from nltk.text import Text from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell print "*** Introductory Examples for the NLTK Book ***" print "Loading ptext1, ... and psent1, ..." print "Type the name of the text or sentence to view it." print "Type: 'texts()' or 'sents()' to list the materials." ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") print "ptext1:", ptext1.name.decode('latin-1') ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") print "ptext2:", ptext2.name.decode('latin-1') ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") print "ptext3:", ptext3.name.decode('latin-1') ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)") print "ptext4:", ptext4.name.decode('latin-1') def texts():
# -*- coding: utf-8 -*- # Some texts for exploration with Portuguese, cf chapter 1 of the book from nltk.corpus import machado, mac_morpho, floresta, genesis from nltk.text import Text from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell print "*** Introductory Examples for the NLTK Book ***" print "Loading ptext1, ... and psent1, ..." print "Type the name of the text or sentence to view it." print "Type: 'texts()' or 'sents()' to list the materials." ptext1 = Text(machado.words('romance/marm05.txt'), name="Mem�rias P�stumas de Br�s Cubas (1881)") print "ptext1:", ptext1.name ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") print "ptext2:", ptext2.name ptext3 = Text(genesis.words('portuguese.txt'), name="G�nesis") print "ptext3:", ptext3.name ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)") print "ptext4:", ptext4.name def texts(): print "ptext1:", ptext1.name print "ptext2:", ptext2.name print "ptext3:", ptext3.name
"""Word-break demo: segment a command-line word using the vocabulary of the
Machado corpus.

Usage: python word_break.py <word>
"""
import sys


def word_break(vocab, word):
    """Yield every segmentation of *word* into pieces found in *vocab*.

    vocab -- container supporting `in`; pass a set for O(1) lookups.
    word  -- the string to segment.

    Yields lists of substrings whose concatenation equals *word*; the empty
    string yields a single empty list (recursion base case).
    """
    if not word:
        yield []
        return
    for i in range(1, len(word) + 1):
        prefix = word[:i]
        if prefix not in vocab:
            continue
        for rest in word_break(vocab, word[i:]):
            yield [prefix] + rest


if __name__ == "__main__":
    # Third-party imports are local to the entry point so that word_break()
    # stays importable without nltk/tqdm installed.
    from nltk.corpus import machado
    from tqdm import tqdm

    word = sys.argv[1]
    # BUG FIX: the original built the vocabulary as a list, converted it to a
    # set, then back to a *list*, making every membership test in word_break
    # O(n). Keep it a set for O(1) lookups; results are unchanged.
    vocab = set()
    for each in tqdm(machado.fileids()):
        vocab.update(machado.words(each))
    for result in word_break(vocab, word):
        print(result)
"""Benchmark setup: load Machado texts, a feature-grammar parser and the RSLP
stemmer, then record which Python implementation is running."""
import sys  # BUG FIX: sys.version is read below, but sys was never imported
import report
from time import time
# if 'PyPy' in sys.version:
#     import numpypy  # important to make numpy import in NLTK work
import nltk
from nltk import *
from nltk.corpus import machado
from nltk import grammar, parse
from nltk.parse.featurechart import InstantiateVarsChart

# Portuguese sentence tokenizer (pre-trained Punkt model).
sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

# Raw text fixtures for the benchmarks.
raw_text1 = machado.raw('romance/marm05.txt')
raw_text2 = machado.raw('romance/marm04.txt')
raw_text3 = machado.raw('romance/marm03.txt')

# Tokenized Text fixtures.
ptext1 = nltk.Text(machado.words('romance/marm01.txt'))
ptext2 = nltk.Text(machado.words('romance/marm02.txt'))
ptext3 = nltk.Text(machado.words('romance/marm03.txt'))
ptext4 = nltk.Text(machado.words('romance/marm04.txt'))

cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
stemmer = nltk.stem.RSLPStemmer()

## Checking version of the benchmarking
if 'PyPy' in sys.version:
    version = 'PyPy {}'.format(sys.version)
else:
    version = 'CPython {}'.format(sys.version)

report.setup('PyPy' in version)
def stemming_bench():
    """Benchmark RSLP stemming over one Machado novel; results are discarded."""
    for token in machado.words('romance/marm05.txt'):
        stemmer.stem(token)
# NLTK ships a large collection of corpora, several of them by famous
# writers. This demo imports a few collections and shows how to look up a
# specific file within one of them.
from nltk.corpus import gutenberg, machado, movie_reviews

# print(gutenberg.fileids())
print(machado.fileids())
# print(movie_reviews.fileids())

machado_teatro2 = machado.words("teatro/matt02.txt")
print(len(machado_teatro2))

# Check the length of austen-emma text that's inside gutenberg's corpora.
# First 22 lines or so of this file were taken from: # http://www.nltk.org/_modules/nltk/examples/pt.html from nltk.corpus import machado, mac_morpho, floresta, genesis from nltk.text import Text, ConcordanceIndex from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell from portuguese.models import Word import datetime print("Type: 'texts()' to list the materials.") ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sao Paulo (1994)") machado_fileids = machado.fileids() machado_words = machado.words( ['romance/marm05.txt', 'cronica/macr04.txt', 'critica/mact15.txt']) #machado_words = machado.words(machado_fileids) + mac_morpho.words('mu94se01.txt') + genesis.words('portuguese.txt') machado_text = Text(machado_words) machado_ci = ConcordanceIndex(machado_text) def texts(): print("ptext1:", ptext1.name) print("ptext2:", ptext2.name)
# NLTK Machado corpus exercise.
from nltk.corpus import machado, stopwords
import nltk

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# 2) Using the machado corpus, write a program that meets the requirements:
# LETRA A -- corpus categories
print(machado.categories())

# LETRA B -- all file ids
print(machado.fileids())

# LETRA C -- word list for one novel
arq = 'romance/marm05.txt'
words = machado.words([arq])
print(words)

# LETRA D -- frequency of two selected words
fdist = nltk.FreqDist(words)
for p in ('olhos', 'estado'):
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# LETRA E -- total token count
print(f'Existem {len(words)} palavras no texto')

# LETRA F -- number of distinct words
print(f'São {len(fdist.keys())} palavras diferentes')

# LETRA G -- the vocabulary itself
print(f'O Vocabulário é {fdist.keys()}')
"Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) self.load_corpus(self.model.DEFAULT_CORPUS)
# Tail of the corpus-selection table: display label -> lazy loader for that
# corpus's word list (this view starts mid-entry; the dict's opening lines
# and the preceding entry's arguments are outside this view).
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    # Background colour used by the Tk widgets.
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Build the collocations GUI and load the default corpus."""
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Kick off loading of the default corpus immediately.
        self.load_corpus(self.model.DEFAULT_CORPUS)