Example #1
0
 def __init__(self):
     """Initialise punctuation tables, the symbol map and the genre sample.

     For each genre name in the hard-coded list, every ``machado`` file whose
     id contains that name is loaded with ``machado.words`` and stored in
     ``self.gens``.  The first ``GEN_CNT`` texts of each genre are appended
     to ``self.texts``, with one matching genre label per text appended to
     ``self.legends``.
     """
     genres = "contos critica cronica miscelanea poesia romance teatro".split()
     self.token = self.punct = self.clauses = self.patterns = ''
     self.punctuation = list(",.:;!?") + ["CC", "--"]
     self.punctuate = list(",.:;!?-")
     # Start from the mapping returned by util.Mont().mont_symbol_pt(), then
     # map every punctuation mark to itself wrapped in an ANSI escape
     # sequence.
     self.classes = util.Mont().mont_symbol_pt()
     self.classes.update({pt: "\033[1;33m{}\033[1;0m".format(pt) for pt in self.punctuate})
     self.marker()
     # Ask the corpus for its file ids once instead of once per genre.
     fileids = machado.fileids()
     self.gens = {gen: [machado.words(txid) for txid in fileids if gen in txid] for gen in genres}
     self.texts = []
     self.legends = []
     self._patt_data = []
     self._patt_labels = []
     for gen in genres:
         chosen = self.gens[gen][:GEN_CNT]
         # One legend per text actually taken, so legends and texts stay
         # aligned even when a genre has fewer than GEN_CNT files.
         self.legends.extend([gen] * len(chosen))
         self.texts.extend(chosen)
     # Show a four-item slice of every selected text.
     for txt in self.texts:
         print(txt[1000:1004])
Example #2
0
File: pt.py Project: ciju/yql_hash
# Author: Steven Bird <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
Example #3
0
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
Example #4
0
from nltk.corpus import machado, stopwords
import nltk
# nltk.download('machado')
# Print the corpus README.
print(machado.readme())

# 2.a Categories present in the corpus
print(machado.categories())

# 2.b File ids available in the corpus
print(machado.fileids())

# 2.c Words of a single document
arq = 'romance/marm05.txt'
palavras = machado.words([arq])
print(palavras)

# 2.d Frequency of selected words in that document
fdist = nltk.FreqDist(palavras)
for p in ['olhos', 'estado']:
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# 2.e Total number of words in the document
print(len(palavras))

# 2.f Number of distinct words (the distribution's own length, no need to
# materialise the key view first)
print(len(fdist))

# 2.g The vocabulary itself
print(fdist.keys())

#2.h
Example #5
0
# ATIVIDADE: EXERCITANDO 3 - PARTE 02
# AUTOR: Paulo Gamero

import os
import pandas as pd
import nltk
from nltk.corpus import machado, stopwords

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# PART A (Tag the words of each document with their grammatical classes. Save
# the POS-tagged corpus to a spreadsheet or text file for later use. It is
# important to keep track of which source document each new document came from)
arq_casmurro = 'romance/marm08.txt'
arq_alienista= 'contos/macn003.txt'

casmurro = machado.words([arq_casmurro])
alienista = machado.words([arq_alienista])

stopword = stopwords.words('portuguese')
minha_stop_w = [',','[',':', '\'','.','...','?',']', '!','/','-',';','\x97','(',')','\x94.','\x93','\x92',]
# Everything to discard, merged into one set so each token costs a single
# hash lookup instead of a scan over two lists.
descartar = set(stopword).union(minha_stop_w)

casmurro_clear = [p for p in casmurro if p not in descartar]
# Only positions 198..10231 of the filtered tokens are kept.
# NOTE(review): presumably this trims the file down to the story body - confirm.
alienista_clear= [p for p in alienista if p not in descartar][198:10232]

# Grammatical classes (tagged)
# NOTE(review): nltk.pos_tag is called without a `lang` argument; confirm the
# default tagger is appropriate for Portuguese text.
pos_tags_casmurro = nltk.pos_tag(casmurro_clear)
pos_tags_alienista= nltk.pos_tag(alienista_clear)

# Build the output path with os.path.join: the previous '\P...' literal held an
# invalid escape sequence and only formed a valid path on Windows.
with open(os.path.join(os.getcwd(), 'Paulo_POS_Dom-Casmurro.txt'), 'w',encoding='utf8') as txt:
    # One "(word, tag)" tuple per line.
    for p in pos_tags_casmurro:
        txt.write(str(p)+'\n')
Example #6
0
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    _BACKGROUND_COLOUR='#FFF' #white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
Example #7
0
# Author: Steven Bird <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')


def texts():
Example #8
0
# -*- coding: utf-8 -*-

# Some texts for exploration with Portuguese, cf chapter 1 of the book

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Mem�rias P�stumas de Br�s Cubas (1881)")
print "ptext1:", ptext1.name

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name

ptext3 = Text(genesis.words('portuguese.txt'), name="G�nesis")
print "ptext3:", ptext3.name

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name

def texts():
    print "ptext1:", ptext1.name
    print "ptext2:", ptext2.name
    print "ptext3:", ptext3.name
Example #9
0
import sys
from nltk.corpus import machado
from tqdm import tqdm


def word_break(vocab, word):
    """Yield every segmentation of *word* into pieces found in *vocab*.

    Each yielded value is a list of pieces that, concatenated in order,
    rebuild *word*.  Shorter leading pieces are tried first.  An empty
    *word* yields a single empty list; a *word* that cannot be segmented
    yields nothing.
    """
    # Base case: nothing left to split, so there is exactly one (empty) answer.
    if not word:
        yield []
        return

    for cut in range(1, len(word) + 1):
        head = word[:cut]
        if head in vocab:
            # Prepend this prefix to every segmentation of the remainder.
            for rest in word_break(vocab, word[cut:]):
                yield [head, *rest]


if __name__ == "__main__":
    # The word to segment is the first command-line argument; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} WORD".format(sys.argv[0]))
    word = sys.argv[1]

    # Collect the distinct tokens of every machado file.  The vocabulary stays
    # a set so that each `in vocab` test inside word_break is a hash lookup
    # rather than a scan of a list.
    vocab = set()
    # Passing the file-id list itself lets tqdm see its length.
    for fileid in tqdm(machado.fileids()):
        vocab.update(machado.words(fileid))

    for result in word_break(vocab, word):
        print(result)
Example #10
0
import sys
from time import time

# if 'PyPy' in sys.version:
    # import numpypy #important to make numpy import in NLTK work
import nltk
from nltk import *
from nltk.corpus import machado
from nltk import grammar, parse
from nltk.parse.featurechart import InstantiateVarsChart

import report

# Sentence tokenizer loaded from the Portuguese Punkt pickle.
sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle')
# Raw text of three machado files.
raw_text1 = machado.raw('romance/marm05.txt')
raw_text2 = machado.raw('romance/marm04.txt')
raw_text3 = machado.raw('romance/marm03.txt')

# nltk Text wrappers around the words of four machado files.
ptext1 = nltk.Text(machado.words('romance/marm01.txt'))
ptext2 = nltk.Text(machado.words('romance/marm02.txt'))
ptext3 = nltk.Text(machado.words('romance/marm03.txt'))
ptext4 = nltk.Text(machado.words('romance/marm04.txt'))

# Parser loaded from the feat0 book grammar, with tracing enabled.
cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
stemmer = nltk.stem.RSLPStemmer()

## Checking version of the benchmarking
# Decide once whether the interpreter identifies itself as PyPy, and reuse
# that answer for both the label and the report setup instead of searching
# the freshly built label string again.
is_pypy = 'PyPy' in sys.version
if is_pypy:
    version = 'PyPy {}'.format(sys.version)
else:
    version = 'CPython {}'.format(sys.version)

report.setup(is_pypy)
Example #11
0
def stemming_bench():
    """Stem every word of 'romance/marm05.txt' with the module-level stemmer.

    The stems are discarded and nothing is returned.  A plain loop is used
    so that no throwaway list of results is built.
    """
    for w in machado.words('romance/marm05.txt'):
        stemmer.stem(w)
# The NLTK corpus package offers a huge collection of data, some of which are texts by famous writers. The following program imports several collections and shows how to look up the file you want.

from nltk.corpus import gutenberg, machado, movie_reviews

# print(gutenberg.fileids())
# List every file id available in the machado corpus.
print(machado.fileids())
# print(movie_reviews.fileids())

# Load the words of one file from the "teatro" folder and print how many
# there are.
machado_teatro2 = machado.words("teatro/matt02.txt")
print(len(machado_teatro2))

# Check the length of austen-emma text that's inside gutenberg's corpora.
Example #13
0
# First 22 lines or so of this file were taken from:
# http://www.nltk.org/_modules/nltk/examples/pt.html

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text, ConcordanceIndex
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell
from portuguese.models import Word
import datetime

# Hint printed when the module is loaded.
print("Type: 'texts()' to list the materials.")

# Named nltk Text wrappers, each around the words of a single corpus file.
ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sao Paulo (1994)")

# Every file id in the machado corpus.
machado_fileids = machado.fileids()
# Words of three machado files (one romance, one cronica, one critica) read
# together.
machado_words = machado.words(
    ['romance/marm05.txt', 'cronica/macr04.txt', 'critica/mact15.txt'])
#machado_words = machado.words(machado_fileids) + mac_morpho.words('mu94se01.txt') + genesis.words('portuguese.txt')
machado_text = Text(machado_words)
# Concordance index built from machado_text.
machado_ci = ConcordanceIndex(machado_text)


def texts():
    print("ptext1:", ptext1.name)
    print("ptext2:", ptext2.name)
Example #14
0
from nltk.corpus import machado, stopwords
import nltk

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# 2) Using the machado corpus, write a program that meets these requirements:
# ITEM A - categories present in the corpus
print(machado.categories())

# ITEM B - file ids available in the corpus
print(machado.fileids())

# ITEM C - words of a single document
arq = 'romance/marm05.txt'
words = machado.words([arq])
print(words)

# ITEM D - frequency of selected words in that document
fdist = nltk.FreqDist(words)
for p in ['olhos', 'estado']:
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# ITEM E - total number of words in the document
print(f'Existem {len(words)} palavras no texto')

# ITEM F - number of distinct words (the distribution's own length, no need
# to materialise the key view first)
print(f'São {len(fdist)} palavras diferentes')

# ITEM G - the vocabulary itself
print(f'O Vocabulário é {fdist.keys()}')
Example #15
0
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
Example #16
0
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)