コード例 #1
0
 def __init__(self):
     """Set up punctuation tables, the symbol map and a per-genre text sample.

     Loads Machado corpus texts grouped by genre, keeps up to ``GEN_CNT``
     texts per genre in ``self.texts`` with a parallel list of genre labels
     in ``self.legends``, and prints a four-token slice of each kept text.
     """
     # Genre names; each is matched as a substring of the corpus file ids.
     classes = "contos critica cronica miscelanea poesia romance teatro".split()
     # All four attributes start out bound to the same empty string.
     self.token = self.punct = self.clauses = self.patterns = ''
     # Two separate marker lists: this one adds the "CC" and "--" entries,
     # the next one is single characters only (including "-").
     self.punctuation = list(",.:;!?") + ["CC", "--"]
     self.punctuate = list(",.:;!?-")
     # _text = self.text_setup(text5)
     # self.split_clauses(_text)
     # self.plot_clause('a')
     # Symbol table obtained from util.Mont; it is treated as a dict below
     # (``.update``). Its contents are not visible from here.
     self.classes = util.Mont().mont_symbol_pt()
     # Map every punctuation character to itself wrapped in the ANSI escape
     # sequence 1;33 (bold yellow), followed by a reset.
     self.classes.update({pt: "\033[1;33m{}\033[1;0m".format(pt) for pt in self.punctuate})
     # Helper defined elsewhere in the class.
     self.marker()
     # genre -> list of word sequences, one per file id containing the genre.
     self.gens = {gen: [machado.words(txid) for txid in machado.fileids() if gen in txid] for gen in classes}
     self.texts = []
     self.legends = []
     self._patt_data = []
     self._patt_labels = []
     for gen in classes:
         # NOTE(review): legends always grows by exactly GEN_CNT while texts
         # grows by at most GEN_CNT, so the two lists fall out of step if a
         # genre has fewer than GEN_CNT texts -- confirm this cannot happen.
         self.legends.extend([gen]*GEN_CNT)
         self.texts.extend([tx for tx in self.gens[gen]][:GEN_CNT])
     # self.texts = [machado.words(conto) for conto in machado.fileids() if "contos" in conto][:20]
     for txt in self.texts:
         # Prints tokens 1000..1003 of every kept text (looks like debug
         # output -- TODO confirm it is still wanted).
         print(txt[1000:1004])
コード例 #2
0
ファイル: pt.py プロジェクト: ciju/yql_hash
# Author: Steven Bird <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    """Print a label and the Latin-1-decoded display name of the loaded texts."""
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
コード例 #3
0
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    """View that creates the Tk root window and its CollocationsModel."""
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        """Create the queue, the model and the Tk root, then build the window."""
        # Queue handed to the model; presumably the channel the model uses to
        # report back to the view -- TODO confirm against CollocationsModel.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        # Root window first, then the menu bar, then the widgets inside it.
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
コード例 #4
0
from nltk.corpus import machado, stopwords
import nltk
# nltk.download('machado')
# Print the corpus README.
print(machado.readme())

# 2.a Categories present in the corpus
print(machado.categories())

# 2.b File ids available in the corpus
print(machado.fileids())

# 2.c Words of a single file (the file id is passed as a one-element list)
arq = 'romance/marm05.txt'
palavras = machado.words([arq])
print(palavras)

# 2.d Frequency of two specific words in that file
fdist = nltk.FreqDist(palavras)
for p in ['olhos', 'estado']:
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# 2.e Total number of tokens in the file
print(len(palavras))

# 2.f Number of distinct tokens (keys of the frequency distribution)
print(len(fdist.keys()))

# 2.g The distinct tokens themselves
print(fdist.keys())

#2.h
コード例 #5
0
# ATIVIDADE: EXERCITANDO 3 - PARTE 02
# AUTOR: Paulo Gamero

import os
import pandas as pd
import nltk
from nltk.corpus import machado, stopwords

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# PART A (Tag the words of each document with their part of speech. Save the
# POS-tagged corpus to a spreadsheet or text file for later use. It is
# important to keep track of which source document each new document came from.)
arq_casmurro = 'romance/marm08.txt'
arq_alienista= 'contos/macn003.txt'

# Word sequences for the two source files (file id passed as a one-element list).
casmurro = machado.words([arq_casmurro])
alienista = machado.words([arq_alienista])

# Tokens to drop: NLTK's Portuguese stop words plus a hand-made punctuation list.
stopword = stopwords.words('portuguese')
# The \x92..\x97 entries are presumably Windows-1252 quote/dash characters
# left over from the corpus encoding -- TODO confirm.
minha_stop_w = [',','[',':', '\'','.','...','?',']', '!','/','-',';','\x97','(',')','\x94.','\x93','\x92',]

casmurro_clear = [p for p in casmurro if (p not in stopword) and (p not in minha_stop_w)]
# Only filtered tokens 198..10231 are kept for this file; presumably this
# trims the file down to one story -- TODO confirm the offsets.
alienista_clear= [p for p in alienista if (p not in stopword) and (p not in minha_stop_w)][198:10232]

# Part-of-speech tags (tagged)
# NOTE(review): nltk.pos_tag is called without a `lang` argument; confirm the
# default tagger is appropriate for Portuguese text.
pos_tags_casmurro = nltk.pos_tag(casmurro_clear)
pos_tags_alienista= nltk.pos_tag(alienista_clear)

# Write one "(word, tag)" tuple repr per line into the current directory.
# os.path.join supplies the platform's separator; a hand-written '\P...'
# suffix is Windows-only and relies on an invalid escape sequence.
with open(os.path.join(os.getcwd(), 'Paulo_POS_Dom-Casmurro.txt'), 'w', encoding='utf8') as txt:
    for p in pos_tags_casmurro:
        txt.write(str(p)+'\n')
コード例 #6
0
ファイル: collocations_app.py プロジェクト: CaptainAL/Spyder
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    """View that creates the Tk root window and its CollocationsModel."""
    _BACKGROUND_COLOUR='#FFF' #white

    def __init__(self):
        """Create the queue, model and Tk root, build the window, load a corpus."""
        # Queue handed to the model; presumably the channel the model uses to
        # report back to the view -- TODO confirm against CollocationsModel.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        # Root window first, then the menu bar, then the widgets inside it.
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Start by loading the corpus the model names as its default.
        self.load_corpus(self.model.DEFAULT_CORPUS)
コード例 #7
0
# Author: Steven Bird <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')


def texts():
コード例 #8
0
ファイル: pt.py プロジェクト: jparise/haitwu-appengine
# -*- coding: utf-8 -*-

# Some texts for exploration with Portuguese, cf chapter 1 of the book

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Mem�rias P�stumas de Br�s Cubas (1881)")
print "ptext1:", ptext1.name

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name

ptext3 = Text(genesis.words('portuguese.txt'), name="G�nesis")
print "ptext3:", ptext3.name

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name

def texts():
    """Print a label and the display name of the loaded texts."""
    print "ptext1:", ptext1.name
    print "ptext2:", ptext2.name
    print "ptext3:", ptext3.name
コード例 #9
0
import sys
from nltk.corpus import machado
from tqdm import tqdm


def word_break(vocab, word):
    """Yield every way to split *word* into consecutive pieces found in *vocab*.

    Each result is a list of substrings that concatenate back to *word*.
    An empty *word* yields exactly one result, the empty list; a word with
    no valid split yields nothing.
    """
    if not word:
        # Nothing left to split: one (empty) segmentation.
        yield []
        return

    for cut in range(1, len(word) + 1):
        head = word[:cut]
        if head in vocab:
            # Prefix is a known word: extend every split of the remainder.
            for tail in word_break(vocab, word[cut:]):
                yield [head] + tail


if __name__ == "__main__":
    # Usage: <script> WORD
    # Prints every segmentation of WORD into tokens of the Machado corpus.
    if len(sys.argv) < 2:
        sys.exit("usage: {} WORD".format(sys.argv[0]))
    word = sys.argv[1]

    # Keep the vocabulary as a set: word_break only performs membership
    # tests on it, which are O(1) on a set but a linear scan on a list.
    vocab = set()
    # Iterating the file-id list directly lets tqdm see its length.
    for each in tqdm(machado.fileids()):
        vocab.update(machado.words(each))

    for result in word_break(vocab, word):
        print(result)
コード例 #10
0
import report
from time import time
# if 'PyPy' in sys.version:
    # import numpypy #important to make numpy import in NLTK work
import nltk
from nltk import *
from nltk.corpus import machado
from nltk import grammar, parse
from nltk.parse.featurechart import InstantiateVarsChart

# Shared fixtures for the benchmark functions: tokenizer, raw texts, Text
# objects, a feature-grammar parser and a stemmer, all loaded once at import.
sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle')
# NOTE(review): raw_textN and ptextN do not refer to the same files
# (raw_text1 is marm05 while ptext1 is marm01) -- confirm this is intended.
raw_text1 = machado.raw('romance/marm05.txt')
raw_text2 = machado.raw('romance/marm04.txt')
raw_text3 = machado.raw('romance/marm03.txt')

ptext1 = nltk.Text(machado.words('romance/marm01.txt'))
ptext2 = nltk.Text(machado.words('romance/marm02.txt'))
ptext3 = nltk.Text(machado.words('romance/marm03.txt'))
ptext4 = nltk.Text(machado.words('romance/marm04.txt'))

# trace=1 is passed through to the parser loader.
cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
stemmer = nltk.stem.RSLPStemmer()

# Label the interpreter being benchmarked.
# NOTE(review): `sys` is not among the imports visible above this block --
# confirm it is actually in scope before this line runs.
if 'PyPy' in sys.version:
    version = 'PyPy {}'.format(sys.version)
else:
    version = 'CPython {}'.format(sys.version)

# Passes True when running under PyPy, False otherwise.
report.setup('PyPy' in version)
コード例 #11
0
def stemming_bench():
    """Benchmark body: stem every token of 'romance/marm05.txt'.

    The list of stems is built and then discarded; nothing is returned.
    """
    tokens = machado.words('romance/marm05.txt')
    [stemmer.stem(token) for token in tokens]
コード例 #12
0
# NLTK's corpus package offers a huge collection of data, including texts by famous writers. The following program imports several collections and shows how to look up the file you want.

from nltk.corpus import gutenberg, machado, movie_reviews

# List the file ids of one collection at a time; the other two collections
# are left commented out.
# print(gutenberg.fileids())
print(machado.fileids())
# print(movie_reviews.fileids())

# Load the words of a single file by its file id and report the token count.
machado_teatro2 = machado.words("teatro/matt02.txt")
print(len(machado_teatro2))

# Check the length of austen-emma text that's inside gutenberg's corpora.
コード例 #13
0
# First 22 lines or so of this file were taken from:
# http://www.nltk.org/_modules/nltk/examples/pt.html

from nltk.corpus import machado, mac_morpho, floresta, genesis
from nltk.text import Text, ConcordanceIndex
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell
from portuguese.models import Word
import datetime

print("Type: 'texts()' to list the materials.")

# Each ptextN wraps one corpus file's word list in an nltk Text with a
# human-readable name.
ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sao Paulo (1994)")

machado_fileids = machado.fileids()
# Combined word list drawn from three Machado files (one romance, one
# cronica, one critica); the commented-out line below is a wider variant.
machado_words = machado.words(
    ['romance/marm05.txt', 'cronica/macr04.txt', 'critica/mact15.txt'])
#machado_words = machado.words(machado_fileids) + mac_morpho.words('mu94se01.txt') + genesis.words('portuguese.txt')
# Text over the combined words, and a ConcordanceIndex built from that Text.
machado_text = Text(machado_words)
machado_ci = ConcordanceIndex(machado_text)


def texts():
    """Print a label and the display name of the loaded texts."""
    print("ptext1:", ptext1.name)
    print("ptext2:", ptext2.name)
コード例 #14
0
from nltk.corpus import machado, stopwords
import nltk

# 1) Run print(machado.readme()) to get to know the corpus better
print(machado.readme())

# 2) Using the machado corpus, write a program that meets these requirements:
# PART A: categories present in the corpus
print(machado.categories())

# PART B: file ids available in the corpus
print(machado.fileids())

# PART C: words of a single file (file id passed as a one-element list)
arq = 'romance/marm05.txt'
words = machado.words([arq])
print(words)

# PART D: frequency of two specific words in that file
fdist = nltk.FreqDist(words)
for p in ['olhos', 'estado']:
    print(f'Arquivo {arq} e frequência da palavra {p} {fdist[p]}')

# PART E: total number of tokens in the file
print(f'Existem {len(words)} palavras no texto')

# PART F: number of distinct tokens (keys of the frequency distribution)
print(f'São {len(fdist.keys())} palavras diferentes')

# PART G: the distinct tokens themselves
print(f'O Vocabulário é {fdist.keys()}')
コード例 #15
0
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """View that creates the Tk root window and its CollocationsModel."""
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, model and Tk root, build the window, load a corpus."""
        # Queue handed to the model; presumably the channel the model uses to
        # report back to the view -- TODO confirm against CollocationsModel.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        # Root window first, then the menu bar, then the widgets inside it.
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Start by loading the corpus the model names as its default.
        self.load_corpus(self.model.DEFAULT_CORPUS)
コード例 #16
0
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """View that creates the Tk root window and its CollocationsModel."""
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, model and Tk root, build the window, load a corpus."""
        # Queue handed to the model; presumably the channel the model uses to
        # report back to the view -- TODO confirm against CollocationsModel.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        # Root window first, then the menu bar, then the widgets inside it.
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Start by loading the corpus the model names as its default.
        self.load_corpus(self.model.DEFAULT_CORPUS)