def __init__(self, **property_names):
    """
    Construct a new treebank tagged-token reader.

    @param property_names: A mapping from abstract property names to
        concrete property names; forwarded both to the property
        indirection mix-in and to the per-sentence reader.
    """
    PropertyIndirectionMixIn.__init__(self, **property_names)
    # Sentence-level parsing is delegated to a chunked tagged-token
    # reader that builds 'S' trees containing 'NP' chunks.
    self._sent_reader = ChunkedTaggedTokenReader(top_node='S',
                                                 chunk_node='NP',
                                                 **property_names)
class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}} is
        the word's text, and C{I{tag}} is its tag.

    In the returned token:

      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of sentence
        tokens.
      - Each sentence token's C{WORDS} property contains a list of
        word tokens.
      - Each word token's C{TEXT} property contains the word's text.
      - Each word token's C{TAG} property contains the word's tag.
      - Depending on the arguments to the reader's constructor, each
        word token may also define the C{LOC} and C{CONTEXT}
        properties.
      - Each sentence token's C{TREE} property contains the chunk
        structures in the text.  In the case of the Treebank, these
        chunk structures were generated by a stochastic NP chunker as
        part of the PARTS preprocessor, and "are best ignored."
    """
    def __init__(self, **property_names):
        """
        Construct a new treebank tagged-token reader.

        @param property_names: A mapping from abstract property names
            to concrete property names.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)
        # A token reader for processing sentences.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False,
                   source=None):
        """
        Read a document token from the treebank tagged-file string C{s}.

        @param s: The string to parse.
        @param add_contexts: If true, add a C{CONTEXT} pointer to each
            sentence token (and request contexts from the sentence
            reader).
        @param add_locs: If true, request location information from the
            sentence reader.
        @param source: The source for the sentence locations.
        @return: A single document token whose C{SENTS} property is a
            list of sentence tokens.
        """
        assert chktype(1, s, str)
        TEXT = self.property('TEXT')
        LOC = self.property('LOC')
        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')
        TREE = self.property('TREE')

        # A sentence runs from the first non-space character up to a
        # word tagged as sentence-final punctuation (e.g. "./.").
        # Raw string avoids invalid-escape warnings for \S and \.
        sentences = re.findall(r'(?s)\S.*?/\.', s)

        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(
                sentence, add_contexts=add_contexts,
                add_locs=add_locs, source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """
        @return: A singleton list containing the document token read
            from C{s}.
        """
        # BUG FIX: 'source' was previously passed positionally, so it
        # landed in read_token's 'add_contexts' parameter; pass it by
        # keyword instead.
        return [self.read_token(s, source=source)]
class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}} is
        the word's text, and C{I{tag}} is its tag.

    In the returned token:

      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of sentence
        tokens.
      - Each sentence token's C{WORDS} property contains a list of
        word tokens.
      - Each word token's C{TEXT} property contains the word's text.
      - Each word token's C{TAG} property contains the word's tag.
      - Depending on the arguments to the reader's constructor, each
        word token may also define the C{LOC} and C{CONTEXT}
        properties.
      - Each sentence token's C{TREE} property contains the chunk
        structures in the text.  In the case of the Treebank, these
        chunk structures were generated by a stochastic NP chunker as
        part of the PARTS preprocessor, and "are best ignored."
    """
    def __init__(self, **property_names):
        """
        Construct a new treebank tagged-token reader.

        @param property_names: A mapping from abstract property names
            to concrete property names.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)
        # A token reader for processing sentences.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False,
                   source=None):
        """
        Read a document token from the treebank tagged-file string C{s}.

        @param s: The string to parse.
        @param add_contexts: If true, add a C{CONTEXT} pointer to each
            sentence token (and request contexts from the sentence
            reader).
        @param add_locs: If true, request location information from the
            sentence reader.
        @param source: The source for the sentence locations.
        @return: A single document token whose C{SENTS} property is a
            list of sentence tokens.
        """
        assert chktype(1, s, str)
        TEXT = self.property('TEXT')
        LOC = self.property('LOC')
        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')
        TREE = self.property('TREE')

        # A sentence runs from the first non-space character up to a
        # word tagged as sentence-final punctuation (e.g. "./.").
        # Raw string avoids invalid-escape warnings for \S and \.
        sentences = re.findall(r'(?s)\S.*?/\.', s)

        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(
                sentence, add_contexts=add_contexts,
                add_locs=add_locs, source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """
        @return: A singleton list containing the document token read
            from C{s}.
        """
        # BUG FIX: 'source' was previously passed positionally, so it
        # landed in read_token's 'add_contexts' parameter; pass it by
        # keyword instead.
        return [self.read_token(s, source=source)]
def __init__(self, **property_names):
    """
    Initialize the reader.

    @param property_names: Abstract-to-concrete property-name
        overrides, shared with the sentence-level reader.
    """
    # Wire up property indirection first, then build the helper that
    # parses each individual sentence into an 'S' tree of 'NP' chunks.
    PropertyIndirectionMixIn.__init__(self, **property_names)
    self._sent_reader = ChunkedTaggedTokenReader(
        top_node='S',
        chunk_node='NP',
        **property_names)
# Demo: parse a bracket-chunked, tagged sentence and print its chunk tree.
from nltk.tokenreader.tagged import ChunkedTaggedTokenReader

chunked_string = "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]"
reader = ChunkedTaggedTokenReader(chunk_node='NP', SUBTOKENS='WORDS')
sent_token = reader.read_token(chunked_string)
# Parenthesized single-argument print is valid on both Python 2 and 3
# (the bare 'print' statement form is Python-2-only).
print(sent_token['TREE'])