Exemple #1
0
    def _chunk_parse(self, grammar=None, top_node='record', trace=0, **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.

        The database is first obtained from C{self.parse(**kwargs)}.  Its
        C{header} element is carried over unchanged, and the children of
        every C{record} element are chunked as a sequence of
        C{(text, tag)} pairs before being converted back to an element
        with C{self._tree2etree}.

        @type grammar: C{string}
        @param grammar: Contains the chunking rules used to parse the
            database.  See L{chunk.RegExp} for documentation.
        @type top_node: C{string}
        @param top_node: The node value that should be used for the
            top node of the chunk structure.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.
        @type kwargs: C{dictionary}
        @param kwargs: Keyword arguments forwarded to C{self.parse()};
            per the original documentation these are passed on to
            L{toolbox.StandardFormat.fields()}.
        @rtype:   C{ElementTree._ElementInterface}
        @return:  Contents of toolbox data parsed according to the rules in grammar
        """
        from nltk import chunk

        cp = chunk.RegexpParser(grammar, top_node=top_node, trace=trace)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        # The header is copied across as-is; only records are chunked.
        tb_etree.append(db.find('header'))
        for record in db.findall('record'):
            # The chunker works on (token, tag) pairs: the field text plays
            # the role of the token and the field marker that of the tag.
            fields = [(elem.text, elem.tag) for elem in record]
            tb_etree.append(self._tree2etree(cp.parse(fields)))
        return tb_etree
Exemple #2
0
    def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.

        The database is first obtained from ``self.parse(**kwargs)``.  Its
        ``header`` element is carried over unchanged, and the children of
        every ``record`` element are chunked as a sequence of
        ``(text, tag)`` pairs before being converted back to an element
        with ``self._tree2etree``.

        :type grammar: str
        :param grammar: Contains the chunking rules used to parse the
            database.  See ``chunk.RegExp`` for documentation.
        :type root_label: str
        :param root_label: The node value that should be used for the
            top node of the chunk structure.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :type kwargs: dict
        :param kwargs: Keyword arguments forwarded to ``self.parse()``;
            per the original documentation these are passed on to
            ``toolbox.StandardFormat.fields()``.
        :rtype: ElementTree._ElementInterface
        :return: A ``toolbox_data`` element holding the header followed by
            one chunked element per record.
        """
        from nltk import chunk

        cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
        db = self.parse(**kwargs)
        tb_etree = Element("toolbox_data")
        # The header is copied across as-is; only records are chunked.
        tb_etree.append(db.find("header"))
        for record in db.findall("record"):
            # The chunker works on (token, tag) pairs: the field text plays
            # the role of the token and the field marker that of the tag.
            fields = [(elem.text, elem.tag) for elem in record]
            tb_etree.append(self._tree2etree(cp.parse(fields)))
        return tb_etree
Exemple #3
0
    def chunk_parse(self,
                    grammar,
                    no_blanks=True,
                    incomplete='record',
                    **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.

        The database is first obtained from C{self.parse(**kwargs)}.  Its
        C{header} element is carried over unchanged.  The children of every
        C{record} element are chunked as C{(text, tag)} pairs; when the
        result is a single top-level chunk that chunk is used, otherwise
        the whole parse is relabelled with C{incomplete} and used instead.

        @type grammar: string
        @param grammar: Contains the chunking rules used to parse the
            database.  See L{chunk.RegExp} for documentation.
        @type no_blanks: boolean
        @param no_blanks: blank fields that are not important to the structure
            are deleted (the flag is handed to C{self._tree2etree}, which
            is expected to do the deletion)
        @type incomplete: string
        @param incomplete: name of element used if parse doesn't result in one
            toplevel element
        @type kwargs: keyword arguments dictionary
        @param kwargs: Keyword arguments forwarded to C{self.parse()}; per the
            original documentation these are passed on to
            L{toolbox.StandardFormat.fields()}
        @rtype:   ElementTree._ElementInterface
        @return:  Contents of toolbox data parsed according to the rules in grammar
        """
        from nltk import chunk
        from nltk.parse import Tree

        cp = chunk.RegexpParser(grammar)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            # A full parse is exactly one child that is itself a chunk.
            # The length is tested first so that an empty parse is treated
            # as incomplete instead of raising IndexError on parsed[0].
            if len(parsed) == 1 and isinstance(parsed[0], Tree):
                top = parsed[0]
            else:
                # didn't get a full parse
                parsed.node = incomplete
                top = parsed
            tb_etree.append(self._tree2etree(top, no_blanks))
        return tb_etree
Exemple #4
0
def demo():
    """
    A demonstration for the C{RegexpChunkParser} class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.

    After the four evaluations on the sample text, the demo also prints
    the accuracy of an empty grammar and of a simple NP grammar on the
    CoNLL-2000 test data, and finally the parse of a hand-tagged token
    list.  Everything is written to standard output; nothing is returned.

    Output uses single-argument C{print(...)} calls so the function is
    valid under both Python 2 and Python 3.
    """

    from nltk import chunk

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print('')

    # Parser 1: two plain chunk rules for a single NP stage.
    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 2: chunk every tag, then unchunk and merge.
    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 3: separate NP and VP stages.
    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 4: cascaded NP, PP and VP stages.
    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Evaluation

    from nltk.corpus import conll2000

    print('')
    print("Demonstration of empty grammar:")

    cp = chunk.RegexpParser("")
    print(chunk.accuracy(
        cp, conll2000.chunked_sents('test.txt', chunk_types=('NP', ))))

    print('')
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    # Only the first five test sentences are scored here.
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print('')
    print("Demonstration of tagged token input")

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN"),
                    (".", ".")]))
Exemple #5
0
from nltk import chunk
from nltk.corpus import conll2000

# Score a one-rule NP chunker (determiner followed by noun) against the
# NP chunks of the CoNLL-2000 test data.
cp = chunk.RegexpParser("NP: {<DT><NN>}")
print(chunk.accuracy(cp,
                     conll2000.chunked_sents('test.txt', chunk_types=('NP', ))))

# Show one gold-standard training sentence next to what the chunker
# produces for the same sentence once its chunk structure is flattened.
# Single-argument print(...) calls are valid under Python 2 and Python 3.
gold_tree = conll2000.chunked_sents('train.txt', chunk_types=('NP', ))[1]
print(gold_tree)
print(cp.parse(gold_tree.flatten()))