コード例 #1
0
ファイル: CustomCorpora.py プロジェクト: AbhideepRND/NLTK
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# Load a custom word-list corpus from the local cookbook directory and show
# its words and the file ids the reader picked up.
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

# The bundled `names` corpus: list its files and count the entries in one.
print(names.fileids())
print(len(names.words('male.txt')))

# Read every *.pos file of the tagged treebank sample, splitting tokens on
# single spaces. The wsj_*.pos files belong to the Penn Treebank sample, so
# the source tagset is declared as 'en-ptb'; declaring 'en-brown' would make
# the tagset='universal' conversion below look tags up in the Brown mapping
# table and mis-map PTB-only tags such as NNP.
reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-ptb')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
# Same words, with the tags converted from the declared source tagset to the
# universal tagset.
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

# For comparison: the treebank corpus as exposed by NLTK's own loader.
print(treebank.tagged_words())
コード例 #2
0
ファイル: pos_corpus.py プロジェクト: neuroph12/nlpy
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

# Locate the 'cookbook' corpus directory through NLTK's data search path and
# read every *.pos file in it as a tagged corpus.
d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer: split words on single spaces instead of the default
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset: declare the source tagset so tags can be converted
reader = TaggedCorpusReader(d,
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora: query the bundled treebank corpus. The original
# printed `reader` again here, which left the `treebank` import unused and
# never showed the bundled corpus this section is about.
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
コード例 #3
0
ファイル: pos_corpus.py プロジェクト: anderscui/nlpy
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

# Locate the 'cookbook' corpus directory through NLTK's data search path and
# read every *.pos file in it as a tagged corpus.
d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer: split words on single spaces instead of the default
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset: declare the source tagset so tags can be converted
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora: query the bundled treebank corpus. The original
# printed `reader` again here, which left the `treebank` import unused and
# never showed the bundled corpus this section is about.
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
コード例 #4
0
########## TAGGED CORPUS READER ###############

from nltk.corpus.reader import TaggedCorpusReader
# Directory holding the cookbook corpus files (note the trailing separator,
# which lets `root + file` form a full path).
root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file="brown.pos"
# Full path to the sample file; kept for reference, the readers below select
# files by pattern instead.
source=root+file

# Use a regex to match all files with the extension .pos
reader=TaggedCorpusReader(root,r'.*\.pos')

# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare `print x` statement form is a syntax error on Python 3.
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())


# TaggedCorpusReader uses a default word tokenizer, but it can be replaced
from nltk.tokenize import SpaceTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',word_tokenizer=SpaceTokenizer())
print(reader.words())

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',sent_tokenizer=LineTokenizer())
print(reader.words())

# Not yet demonstrated here:
# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57