Code example #1
File: data.py  Project: DrDub/icsisumm
    def chunk_parse(self, grammar, no_blanks=True, incomplete='record', **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.
        
        @type grammar: C{string}
        @param grammar: Contains the chunking rules used to parse the
            database.  See L{chunk.RegExp} for documentation.
        @type no_blanks: C{boolean}
        @param no_blanks: blank fields that are not important to the structure are deleted
        @type incomplete: C{string}
        @param incomplete: name of the element used if the parse doesn't result in one top-level element
        @type kwargs: C{dictionary}
        @param kwargs: Keyword arguments passed to L{toolbox.StandardFormat.fields()}
        @rtype:   C{ElementTree._ElementInterface}
        @return:  Contents of the toolbox data parsed according to the rules in grammar
        """
        from nltk import chunk
        from nltk.parse import Tree

        cp = chunk.RegexpParser(grammar)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            top = parsed[0]
            if not isinstance(top, Tree) or len(parsed) != 1:
                # didn't get a full parse
                parsed.node = incomplete
                top = parsed
            tb_etree.append(self._tree2etree(top, no_blanks))
        return tb_etree
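
A minimal usage sketch for chunk_parse follows. The grammar, the field markers (lx, ps, ge), the file name, and the ToolboxData class assumed to hold this method are illustrative assumptions based on the NLTK 2-era toolbox API, not taken from the snippet above.

# Usage sketch (illustrative): group part-of-speech and gloss fields into a
# 'sense' chunk, then group the lexeme and its senses into a 'record'.
grammar = r"""
    sense:  {<ps><ge>}
    record: {<lx><sense>+}
"""

reader = ToolboxData('lexicon.dic')   # hypothetical instance and file name
etree = reader.chunk_parse(grammar, no_blanks=True, incomplete='record')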
Code example #2
    def _tree2etree(self, parent):
        from nltk.parse import Tree

        root = Element(parent.node)
        for child in parent:
            if isinstance(child, Tree):
                root.append(self._tree2etree(child))
            else:
                text, tag = child
                e = SubElement(root, tag)
                e.text = text
        return root
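
For reference, a sketch of the conversion this helper performs, assuming the NLTK 2-era Tree API in which the node label is stored in .node (NLTK 3 renamed it to label()). The markers and words are illustrative.

from nltk.parse import Tree   # same import used above

# A chunked record: (text, tag) leaves, with two fields grouped under 'sense'.
tree = Tree('record', [('kaa', 'lx'),
                       Tree('sense', [('N', 'ps'), ('gag', 'ge')])])

# self._tree2etree(tree) would return an element equivalent to:
#   <record><lx>kaa</lx><sense><ps>N</ps><ge>gag</ge></sense></record>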
Code example #3
File: toolbox.py  Project: wrand/tweater
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).
    
    @param tree: flat representation of toolbox data (whole database or single record)
    @type tree: C{ElementTree._ElementInterface}
    @param encoding: Name of an encoding to use.
    @type encoding: C{string}
    @param errors: Error handling scheme for codec. Same as the C{encode} 
        inbuilt string method.
    @type errors: C{string}
    @param unicode_fields: names of fields that are always written out in UTF-8, regardless of C{encoding}
    @type unicode_fields: C{dictionary} or C{set} of field names
    @rtype:   C{string}
    @return:  C{string} using standard format markup
    """
    if tree.tag == 'record':
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if unicode_fields is specified, encoding must also be specified")
    l = []
    for rec in tree:
        l.append('\n')
        for field in rec:
            mkr = field.tag
            value = field.text
            if encoding is not None:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                if re.search(_is_value, value):
                    l.append((u"\\%s %s\n" % (mkr, value)).encode(
                        cur_encoding, errors))
                else:
                    l.append((u"\\%s%s\n" % (mkr, value)).encode(
                        cur_encoding, errors))
            else:
                if re.search(_is_value, value):
                    l.append("\\%s %s\n" % (mkr, value))
                else:
                    l.append("\\%s%s\n" % (mkr, value))
    return ''.join(l[1:])
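
A minimal sketch of serializing a hand-built record with to_sfm_string; the field markers and values are illustrative, and _is_value is assumed to be the module-level regex (matching any non-whitespace character) defined alongside this function.

from xml.etree.ElementTree import Element, SubElement

record = Element('record')
SubElement(record, 'lx').text = 'kaa'
SubElement(record, 'ps').text = 'N'
SubElement(record, 'ge').text = 'gag'

print(to_sfm_string(record))
# \lx kaa
# \ps N
# \ge gag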
Code example #4
File: data.py  Project: sushengyang/NLP-project
def inline_char_coded_text(tag,  s):
    """return an element with the char coded text converted to span elements with appropriate attributes
        
    @param tag: tag for returned element
    @type tag: C{String}
    @param s: element corresponding to an MDF field. This is modified by the function. 
        It may already have 'span' subelements corresponding to character styled text earlier in the MDF field.
    @type s: C{String}
    @return: an element with the character code styles converted to spans elements.
    @rtype: C{ElementTree._ElementInterface}
    """
    elem = Element(tag)
    elem.text = s
    inline_char_coded_elem(elem)
    return elem
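
A minimal sketch of calling inline_char_coded_text; the marker name, the text, and the |fv{...} character code are illustrative assumptions, and inline_char_coded_elem is assumed to be defined elsewhere in the same file.

# Illustrative call: wrap the text of a gloss field ('ge') in an element and
# let inline_char_coded_elem rewrite any inline character codes as <span>
# subelements of that element.
elem = inline_char_coded_text('ge', 'a gloss containing |fv{kaa} in the vernacular')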
Code example #5
File: toolbox.py  Project: jparise/haitwu-appengine
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).
    
    @param tree: flat representation of toolbox data (whole database or single record)
    @type tree: C{ElementTree._ElementInterface}
    @param encoding: Name of an encoding to use.
    @type encoding: C{string}
    @param errors: Error handling scheme for codec. Same as the C{encode} 
        inbuilt string method.
    @type errors: C{string}
    @param unicode_fields: names of fields that are always written out in UTF-8, regardless of C{encoding}
    @type unicode_fields: C{dictionary} or C{set} of field names
    @rtype:   C{string}
    @return:  C{string} using standard format markup
    """
    if tree.tag == 'record':
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if unicode_fields is specified, encoding must also be specified")
    l = []
    for rec in tree:
        l.append('\n')
        for field in rec:
            mkr = field.tag
            value = field.text
            if encoding is not None:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                if re.search(_is_value, value):
                    l.append((u"\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors))
                else:
                    l.append((u"\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors))
            else:
                if re.search(_is_value, value):
                    l.append("\\%s %s\n" % (mkr, value))
                else:
                    l.append("\\%s%s\n" % (mkr, value))
    return ''.join(l[1:])
Code example #6
    def _chunk_parse(self, grammar=None, top_node='record', trace=0, **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.
        
        @type grammar: C{string}
        @param grammar: Contains the chunking rules used to parse the 
        database.  See L{chunk.RegExp} for documentation.
        @type top_node: C{string}
        @param top_node: The node value that should be used for the
            top node of the chunk structure.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.
        @type kwargs: C{dictionary}
        @param kwargs: Keyword arguments passed to L{toolbox.StandardFormat.fields()}
        @rtype:   C{ElementTree._ElementInterface}
        @return:  Contents of toolbox data parsed according to the rules in grammar
        """
        from nltk import chunk
        from nltk.parse import Tree

        cp = chunk.RegexpParser(grammar, top_node=top_node, trace=trace)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            tb_etree.append(self._tree2etree(parsed))
        return tb_etree
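
A usage sketch mirroring the one for chunk_parse in example #1; unlike that method, _chunk_parse hands the whole chunk structure (labeled top_node) to _tree2etree, so no 'incomplete' fallback is needed. The grammar and the reader instance are illustrative assumptions.

grammar = r"""
    sense: {<ps><ge>}
"""

etree = reader._chunk_parse(grammar=grammar, top_node='record', trace=0)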
Code example #7
File: data.py  Project: steven-cutting/icsisumm
    def chunk_parse(self,
                    grammar,
                    no_blanks=True,
                    incomplete='record',
                    **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.
        
        @type grammar: C{string}
        @param grammar: Contains the chunking rules used to parse the
            database.  See L{chunk.RegExp} for documentation.
        @type no_blanks: C{boolean}
        @param no_blanks: blank fields that are not important to the structure are deleted
        @type incomplete: C{string}
        @param incomplete: name of the element used if the parse doesn't result in one top-level element
        @type kwargs: C{dictionary}
        @param kwargs: Keyword arguments passed to L{toolbox.StandardFormat.fields()}
        @rtype:   C{ElementTree._ElementInterface}
        @return:  Contents of the toolbox data parsed according to the rules in grammar
        """
        from nltk import chunk
        from nltk.parse import Tree

        cp = chunk.RegexpParser(grammar)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            top = parsed[0]
            if not isinstance(top, Tree) or len(parsed) != 1:
                # didn't get a full parse
                parsed.node = incomplete
                top = parsed
            tb_etree.append(self._tree2etree(top, no_blanks))
        return tb_etree