Example no. 1
0
    def __init__(self, **property_names):
        """Initialize the reader, forwarding *property_names* to the mixin."""
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # Per-sentence parsing is delegated to a chunked/tagged reader
        # that builds 'S' trees containing 'NP' chunks.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)
Example no. 2
0
class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}}
        is the word's text, and C{I{tag}} is its tag.

    In the returned token:
    
      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of
        sentence tokens.
          - Each sentence token's C{WORDS} property contains a list of
            word tokens.
            - Each word token's C{TEXT} property contains the word's
              text.
            - Each word token's C{TAG} property contains the word's
              tag.
            - Depending on the arguments to the reader's constructor,
              each word token may also define the C{LOC} and
              C{CONTEXT} properties.
          - Each sentence token's C{TREE} property contains the
            chunk structures in the text.  In the case of the Treebank,
            these chunk structures were generated by a stochastic NP
            chunker as part of the PARTS preprocessor, and \"are best
            ignored.\"
    """
    def __init__(self, **property_names):
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # A token reader for processing individual sentences.
        self._sent_reader = ChunkedTaggedTokenReader(top_node='S',
                                                     chunk_node='NP',
                                                     **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False, source=None):
        """
        Return a document token whose C{SENTS} property contains one
        sentence token per sentence found in the string C{s}.

        @param add_contexts: If true, add a C{CONTEXT} pointer to each
            sentence token (and ask the sentence reader to do likewise
            for its subtokens).
        @param add_locs: Passed through to the sentence reader.
        @param source: The source used to build sentence locations.
        """
        assert chktype(1, s, str)

        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')

        # A sentence ends at a word tagged as sentence-final
        # punctuation ('/.').  Raw string avoids relying on Python
        # passing '\S' and '\.' through unchanged.
        sentences = re.findall(r'(?s)\S.*?/\.', s)
        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(sentence,
                                                    add_contexts=add_contexts,
                                                    add_locs=add_locs,
                                                    source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """Return a singleton list containing the document token."""
        # Bug fix: `source` was previously passed positionally, which
        # bound it to `add_contexts` rather than `source`.
        return [self.read_token(s, source=source)]
Example no. 3
0
class TreebankTaggedTokenReader(TokenReaderI, PropertyIndirectionMixIn):
    """
    A token reader that reads the treebank tagged-file format into a
    token.  In this format:

      - Paragraphs are separated by lines of C{'='} characters.
      - Sentences are separated by words tagged as sentence-final
        punctuation (e.g., C{'./.'}).
      - NP chunk structure is encoded with square brackets (C{[...]}).
      - Words are separated by whitespace or square brackets.
      - Each word has the form C{I{text}/I{tag}}, where C{I{text}}
        is the word's text, and C{I{tag}} is its tag.

    In the returned token:
    
      - The returned token describes a single document.
      - The document's C{SENTS} property contains a list of
        sentence tokens.
          - Each sentence token's C{WORDS} property contains a list of
            word tokens.
            - Each word token's C{TEXT} property contains the word's
              text.
            - Each word token's C{TAG} property contains the word's
              tag.
            - Depending on the arguments to the reader's constructor,
              each word token may also define the C{LOC} and
              C{CONTEXT} properties.
          - Each sentence token's C{TREE} property contains the
            chunk structures in the text.  In the case of the Treebank,
            these chunk structures were generated by a stochastic NP
            chunker as part of the PARTS preprocessor, and \"are best
            ignored.\"
    """
    def __init__(self,  **property_names):
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # A token reader for processing individual sentences.
        self._sent_reader = ChunkedTaggedTokenReader(
            top_node='S', chunk_node='NP', **property_names)

    def read_token(self, s, add_contexts=False, add_locs=False, 
                   source=None):
        """
        Return a document token whose C{SENTS} property contains one
        sentence token per sentence found in the string C{s}.

        @param add_contexts: If true, add a C{CONTEXT} pointer to each
            sentence token (and ask the sentence reader to do likewise
            for its subtokens).
        @param add_locs: Passed through to the sentence reader.
        @param source: The source used to build sentence locations.
        """
        assert chktype(1, s, str)

        CONTEXT = self.property('CONTEXT')
        SENTS = self.property('SENTS')

        # A sentence ends at a word tagged as sentence-final
        # punctuation ('/.').  Raw string avoids relying on Python
        # passing '\S' and '\.' through unchanged.
        sentences = re.findall(r'(?s)\S.*?/\.', s)
        sent_toks = []
        for sent_num, sentence in enumerate(sentences):
            sent_loc = SentIndexLocation(sent_num, source)
            sent_tok = self._sent_reader.read_token(
                sentence, add_contexts=add_contexts,
                add_locs=add_locs, source=sent_loc)
            sent_toks.append(sent_tok)
        tok = Token(**{SENTS: sent_toks})

        # Add context pointers, if requested
        if add_contexts:
            for i, sent_tok in enumerate(tok[SENTS]):
                sent_tok[CONTEXT] = SubtokenContextPointer(tok, SENTS, i)

        # Return the finished token.
        return tok

    def read_tokens(self, s, source=None):
        """Return a singleton list containing the document token."""
        # Bug fix: `source` was previously passed positionally, which
        # bound it to `add_contexts` rather than `source`.
        return [self.read_token(s, source=source)]
Example no. 4
0
    def __init__(self,  **property_names):
        """Initialize the reader, forwarding *property_names* to the mixin."""
        PropertyIndirectionMixIn.__init__(self, **property_names)

        # Sentences are parsed by a chunked/tagged reader producing
        # 'S' trees with 'NP' chunk nodes.
        self._sent_reader = ChunkedTaggedTokenReader(top_node='S',
                                                     chunk_node='NP',
                                                     **property_names)
Example no. 5
0
from nltk.tokenreader.tagged import ChunkedTaggedTokenReader

# Bracketed groups ([ ... ]) mark NP chunks; each word has the form text/TAG.
chunked_string = "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]"
reader = ChunkedTaggedTokenReader(chunk_node='NP', SUBTOKENS='WORDS')
sent_token = reader.read_token(chunked_string)
# Call form of print: identical output in Python 2 for a single
# argument, and also valid under Python 3 (the bare statement is not).
print(sent_token['TREE'])