コード例 #1
0
# load_multiple_corpus_files('docs\\brown_hw\\Test\\')


# Build a tagged-corpus reader over every file in the training directory.
train_root = 'docs\\brown_hw_lowercase\\Train'
train_reader = TaggedCorpusReader(train_root, '.*')

# Frequency distribution over all tokens of the training set.
train_words = train_reader.words()
train_word_counts = nltk.FreqDist(train_words)

# Result of keys_by_value (defined elsewhere); per the original note these
# are the training words that occur only once -- TODO confirm in the helper.
train_word_counts_1 = keys_by_value(train_word_counts)

# Copy each tagged word into a mutable list so the word part can be
# overwritten in place below.
tagged_words_with_unk = [list(pair) for pair in train_reader.tagged_words()]

# Replace every word found in train_word_counts_1 with the 'UNK' placeholder.
for pair in tagged_words_with_unk:
    if pair[0] in train_word_counts_1.keys():
        pair[0] = 'UNK'

# Count how many times each tag occurs and store the totals in a dictionary.
tag_frequency = {}
for pair in tagged_words_with_unk:
    tag = pair[1]
    tag_frequency[tag] = tag_frequency.get(tag, 0) + 1

コード例 #2
0
# Demonstrates three NLTK corpus readers (plain text, tagged, categorized)
# over local corpus directories, then starts a word-frequency analysis.
import nltk  # binds the `nltk` name used by the stopwords / FreqDist calls below
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader

# Plain-text corpus.
# NOTE: the file-id patterns are regular expressions, so they are written as
# raw strings; '\.' in a normal string literal is an invalid escape sequence.
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged-text corpus.
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
# Print each sentence as plain text, its tokens joined by single spaces.
for s in corpus2.sents():
    print(' '.join(s))

# Categorized plain-text corpus; the category mapping is read from
# the "categorias.txt" file passed as cat_file.
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword list for Portuguese.
stopwords = nltk.corpus.stopwords.words('portuguese')
# Case-insensitive frequency distribution over all words of the categorized corpus.
fd = nltk.FreqDist(w.lower() for w in corpus3.words())