Python ChunkedTaggedTokenReader Examples

Programming Language: Python

Namespace/Package Name: nltk.tokenreader.tagged

Examples at hotexamples.com: 5

Python ChunkedTaggedTokenReader - 5 examples found. These are the top-rated real-world Python examples of nltk.tokenreader.tagged.ChunkedTaggedTokenReader extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ChunkedTaggedTokenReader(2)

read_token(2)

Example #1

Show file

    def __init__(self, **property_names):
        """
        Initialize the reader and build its sentence-level reader.

        @param property_names: Keyword arguments forwarded unchanged
            both to C{PropertyIndirectionMixIn.__init__} and to the
            C{ChunkedTaggedTokenReader} constructed here.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # Individual sentences are delegated to this reader, which is
        # configured with top_node='S' and chunk_node='NP'.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)

Example #2

Show file

class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}}
        is the word's text, and C{I{tag}} is its tag.

    In the returned token:

      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of
        sentence tokens.
          - Each sentence token's C{WORDS} property contains a list of
            word tokens.
            - Each word token's C{TEXT} property contains the word's
              text.
            - Each word token's C{TAG} property contains the word's
              tag.
            - Depending on the arguments to the reader's constructor,
              each word token may also define the C{LOC} and
              C{CONTEXT} properties.
          - Each sentence token's C{TREE} property contains the
            chunk structures in the text.  In the case of the Treebank,
            these chunk structures were generated by a stochastic NP
            chunker as part of the PARTS preprocessor, and "are best
            ignored."
    """
    def __init__(self, **property_names):
        """
        Initialize the reader and build its sentence-level reader.

        @param property_names: Keyword arguments forwarded unchanged
            both to C{PropertyIndirectionMixIn.__init__} and to the
            C{ChunkedTaggedTokenReader} used for each sentence.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # A token reader for processing sentences.
        self._sent_reader = ChunkedTaggedTokenReader(top_node='S',
                                                     chunk_node='NP',
                                                     **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False, source=None):
        """
        Read a treebank tagged-format string into one document token.

        @param s: The text to read.  Checked with C{chktype} to be a
            C{str}.
        @param add_contexts: Forwarded to the sentence reader.  If
            true, each sentence token's C{CONTEXT} property is also
            set to a C{SubtokenContextPointer} into the returned token.
        @param add_locs: Forwarded to the sentence reader.
        @param source: Used as the source of the C{SentIndexLocation}
            built for each sentence.
        @return: A token whose C{SENTS} property holds the list of
            sentence tokens, in the order they occur in C{s}.
        """
        assert chktype(1, s, str)

        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')

        # Each sentence runs from a non-whitespace character through
        # the nearest following '/.'; (?s) lets it span line breaks.
        # Text after the last '/.' matches nothing and is dropped.
        sentences = re.findall(r'(?s)\S.*?/\.', s)
        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(sentence,
                                                    add_contexts=add_contexts,
                                                    add_locs=add_locs,
                                                    source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """
        Read C{s} as a single document and return it in a list.

        @param s: The text to read.
        @param source: The source passed on to C{read_token}.
        @return: A one-element list containing the document token.
        """
        # Pass source by keyword: read_token's second positional
        # parameter is add_contexts, so a positional source would be
        # taken as that flag and the real source would be lost.
        return [self.read_token(s, source=source)]

Example #3

Show file

File: treebank.py Project: ronaldahmed/robot-navigation

class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}}
        is the word's text, and C{I{tag}} is its tag.

    In the returned token:

      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of
        sentence tokens.
          - Each sentence token's C{WORDS} property contains a list of
            word tokens.
            - Each word token's C{TEXT} property contains the word's
              text.
            - Each word token's C{TAG} property contains the word's
              tag.
            - Depending on the arguments to the reader's constructor,
              each word token may also define the C{LOC} and
              C{CONTEXT} properties.
          - Each sentence token's C{TREE} property contains the
            chunk structures in the text.  In the case of the Treebank,
            these chunk structures were generated by a stochastic NP
            chunker as part of the PARTS preprocessor, and "are best
            ignored."
    """
    def __init__(self, **property_names):
        """
        Initialize the reader and build its sentence-level reader.

        @param property_names: Keyword arguments forwarded unchanged
            both to C{PropertyIndirectionMixIn.__init__} and to the
            C{ChunkedTaggedTokenReader} used for each sentence.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # A token reader for processing sentences.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False,
                   source=None):
        """
        Read a treebank tagged-format string into one document token.

        @param s: The text to read.  Checked with C{chktype} to be a
            C{str}.
        @param add_contexts: Forwarded to the sentence reader.  If
            true, each sentence token's C{CONTEXT} property is also
            set to a C{SubtokenContextPointer} into the returned token.
        @param add_locs: Forwarded to the sentence reader.
        @param source: Used as the source of the C{SentIndexLocation}
            built for each sentence.
        @return: A token whose C{SENTS} property holds the list of
            sentence tokens, in the order they occur in C{s}.
        """
        assert chktype(1, s, str)

        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')

        # Each sentence runs from a non-whitespace character through
        # the nearest following '/.'; (?s) lets it span line breaks.
        # Text after the last '/.' matches nothing and is dropped.
        sentences = re.findall(r'(?s)\S.*?/\.', s)
        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(
                sentence, add_contexts=add_contexts,
                add_locs=add_locs, source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """
        Read C{s} as a single document and return it in a list.

        @param s: The text to read.
        @param source: The source passed on to C{read_token}.
        @return: A one-element list containing the document token.
        """
        # Pass source by keyword: read_token's second positional
        # parameter is add_contexts, so a positional source would be
        # taken as that flag and the real source would be lost.
        return [self.read_token(s, source=source)]

Example #4

Show file

File: treebank.py Project: ronaldahmed/robot-navigation

    def __init__(self, **property_names):
        """
        Initialize the reader and build its sentence-level reader.

        @param property_names: Keyword arguments forwarded unchanged
            both to C{PropertyIndirectionMixIn.__init__} and to the
            C{ChunkedTaggedTokenReader} constructed here.
        """
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # Individual sentences are delegated to this reader, which is
        # configured with top_node='S' and chunk_node='NP'.
        sentence_reader = ChunkedTaggedTokenReader(top_node='S',
                                                   chunk_node='NP',
                                                   **property_names)
        self._sent_reader = sentence_reader

Example #5

Show file

File: ex1.py Project: LucasCosas/machine-learning

from nltk.tokenreader.tagged import ChunkedTaggedTokenReader
# Sample input: each word is written text/tag, and each square-bracketed
# span marks one chunk.
chunked_string = "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]"
# chunk_node='NP' is the node label requested for chunks.
# NOTE(review): SUBTOKENS='WORDS' presumably renames the reader's
# subtoken property to WORDS -- confirm against the NLTK 1.x docs.
reader = ChunkedTaggedTokenReader(chunk_node='NP', SUBTOKENS='WORDS')
sent_token = reader.read_token(chunked_string)
# Parenthesized single-argument print: same output under the Python 2
# print statement, and also valid as the Python 3 print function.
print(sent_token['TREE'])