Code Example #1
    def readCorpusList(self):
        reader = ChunkedCorpusReader('C:/nltk_data/corpora/cookbook',
                                     r'.*\.chunk')

        for chunk in reader.chunked_words():
            try:
                print(chunk.leaves())
            except AttributeError:
                # Unchunked tokens are (word, tag) tuples, not Trees,
                # so they have no leaves() method
                print(chunk)
Code Example #2
 def chunked_paras(self, fileids=None, categories=None):
     return ChunkedCorpusReader.chunked_paras(
         self, self._resolve(fileids, categories))
Code Example #3
 def tagged_sents(self, fileids=None, categories=None):
     return ChunkedCorpusReader.tagged_sents(
         self, self._resolve(fileids, categories))
Code Example #4
 def words(self, fileids=None, categories=None):
     return ChunkedCorpusReader.words(self,
                                      self._resolve(fileids, categories))
Code Example #5
 def __init__(self, *args, **kwargs):
     CategorizedCorpusReader.__init__(self, kwargs)
     ChunkedCorpusReader.__init__(self, *args, **kwargs)
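
Code Examples #2 through #6 are fragments of the same pattern: a CategorizedChunkedCorpusReader that mixes CategorizedCorpusReader into ChunkedCorpusReader and routes every reading method through a _resolve helper. A minimal sketch of how the pieces fit together, with _resolve written out the way NLTK's CategorizedCorpusReader defines it (the corpus root and cat_pattern at the bottom are illustrative placeholders):

from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader

class CategorizedChunkedCorpusReader(CategorizedCorpusReader, ChunkedCorpusReader):
    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader pops cat_pattern/cat_map/cat_file out of kwargs
        CategorizedCorpusReader.__init__(self, kwargs)
        ChunkedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        # Turn a categories argument into the matching fileids
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        return fileids

    def chunked_sents(self, fileids=None, categories=None):
        return ChunkedCorpusReader.chunked_sents(
            self, self._resolve(fileids, categories))

reader = CategorizedChunkedCorpusReader(
    '/path/to/corpus', r'.*\.chunk', cat_pattern=r'(\w+)/.*\.chunk')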
Code Example #6
File: catchunked.py  Project: andacsafa/nltk_book
 def sents(self, fileids=None, categories=None):
     return ChunkedCorpusReader.sents(self,
                                      self._resolve(fileids, categories))
Code Example #7
from nltk.corpus.reader import TaggedCorpusReader, ChunkedCorpusReader
from nltk.tokenize import SpaceTokenizer

reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(
    reader.tagged_words(tagset='universal')
)  ## Map tags to the universal tagset; with a wrong source tagset every tag becomes UNK

## Reading chunked corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
                             r'treebank.chunk',
                             tagset='en-brown')
print(reader.chunked_words())  ## Word level structure
print(reader.chunked_sents())  ## Sentence level structure
print(reader.chunked_paras())  ## Paragraph level structure

## Reading categorized corpora ##################
## Category info comes from cat_pattern (a file-name regex), cat_map (a dict), or cat_file ######
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt'
)  ## cat_pattern is the easiest way: categories are derived from the file names
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
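
When a file-name regex is impractical, the categories can also be given explicitly. A sketch continuing the script above, using cat_map, NLTK's dictionary-based alternative to cat_pattern (the file names here are illustrative):

reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data', r'movie_.*\.txt',
    cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']})
print(reader.categories())  # ['neg', 'pos']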
Code Example #8
File: catchunked.py  Project: andacsafa/nltk_book
 def tagged_sents(self, fileids=None, categories=None):
     return ChunkedCorpusReader.tagged_sents(
         self, self._resolve(fileids, categories))

 def chunked_paras(self, fileids=None, categories=None):
     return ChunkedCorpusReader.chunked_paras(
         self, self._resolve(fileids, categories))

 def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
     return ChunkedCorpusReader.tagged_paras(
         self, self._resolve(fileids, categories), simplify_tags)

 def sents(self, fileids=None, categories=None):
     return ChunkedCorpusReader.sents(self, self._resolve(fileids, categories))
Code Example #12
########## CHUNKED CORPUS READER ###############

### Implementing ChunkedCorpusReader
from nltk.corpus.reader import ChunkedCorpusReader
root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

reader = ChunkedCorpusReader(root, r'.*\.chunk')
# Each bracketed chunk is returned as a Tree; unchunked tokens come back as (word, tag) tuples
print(reader.chunked_words())
# Each sentence is wrapped in a Tree()
print(reader.chunked_sents())
print(reader.chunked_paras())

# Getting tagged tokens for each chunk (every chunk is a word, but not every word is a chunk)
print(reader.chunked_words()[0].leaves())
print(reader.chunked_sents()[1].leaves())
# Can't apply leaves() directly to a para, but we can access a sentence of a given para
print(reader.chunked_paras()[0][0].leaves())

### Implementing ConllChunkCorpusReader
from nltk.corpus.reader import ConllChunkCorpusReader
root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

reader = ConllChunkCorpusReader(root, r'.*\.iob', ('NP', 'VP', 'PP'))
print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.iob_words())
print(reader.iob_sents())
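
ConllChunkCorpusReader expects CoNLL-style IOB files: one token per line, carrying the word, its POS tag, and a B-/I-/O chunk tag, which is why iob_words() yields (word, tag, iob) triples. An illustrative .iob snippet:

Mr. NNP B-NP
Meador NNP I-NP
had VBD B-VP
been VBN I-VP
executive JJ B-NP
vice NN I-NP
president NN I-NP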
Code Example #13
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = ChunkedCorpusReader(d, r'.*\.chunk')
print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.chunked_paras())

# reader.chunked_sents()[0].draw()
print(reader.chunked_sents()[0].leaves())
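
For reference, the .chunk files consumed above are plain tagged text in which each chunk is wrapped in square brackets; one illustrative line in that format:

[Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ./.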
Code Example #14
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.tag import UnigramTagger, BigramTagger
from chunkers import TagChunker  # local helper module from the NLTK cookbook, not part of NLTK itself


def createChunker():
    # Train a tag-based chunker on locally stored chunked sentences
    chunks = ChunkedCorpusReader('data/chunks/', 'text_search.pos')
    tagger_classes = [UnigramTagger, BigramTagger]
    train_chunks = chunks.chunked_sents()
    chunker = TagChunker(train_chunks, tagger_classes)
    return chunker
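
A hedged usage sketch for the chunker built above, assuming TagChunker follows NLTK's ChunkParserI interface and therefore exposes parse() over a POS-tagged sentence:

from nltk import pos_tag, word_tokenize

chunker = createChunker()
tagged = pos_tag(word_tokenize('The quick brown fox jumped over the lazy dog.'))
print(chunker.parse(tagged))  # prints a chunk Tree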
Code Example #15
File: BguCorpusReader.py  Project: jedimonster/nlp
 def __init__(self, directory="", fileids=r"haaretz.bgu", myEncoding="utf-8"):
     ChunkedCorpusReader.__init__(
         self, directory, fileids,
         str2chunktree=self.__str2BguTree,
         sent_tokenizer=RegexpTokenizer('\n\n', gaps=True),
         encoding=myEncoding)
     self._format = format
Code Example #16

 def __init__(self, *args, **kwargs):
     CategorizedCorpusReader.__init__(self, kwargs)
     ChunkedCorpusReader.__init__(self, *args, **kwargs)
Code Example #17
 def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
     return ChunkedCorpusReader.tagged_paras(
         self, self._resolve(fileids, categories), simplify_tags)