def test_process_inner_child():
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.P("1. 111. i. iii")
        ctx.STARS()
        ctx.P("A. AAA")
        ctx.child_from_string('<P><E T="03">1.</E> eee</P>')
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    n1 = stack.m_stack[0][0][1]
    assert n1.label == ['1']
    assert len(n1.children) == 1

    n1i = n1.children[0]
    assert n1i.label == ['1', 'i']
    assert n1i.text == 'i. iii'
    assert len(n1i.children) == 1

    n1ia = n1i.children[0]
    assert n1ia.label == ['1', 'i', 'A']
    assert len(n1ia.children) == 1

    n1ia1 = n1ia.children[0]
    assert n1ia1.label == ['1', 'i', 'A', '1']
    assert n1ia1.children == []
    def test_process_inner_child(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <P>1. 111. i. iii</P>
            <STARS />
            <P>A. AAA</P>
            <P><E T="03">1.</E> eee</P>
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        n1 = stack.m_stack[0][0][1]
        self.assertEqual(['1'], n1.label)
        self.assertEqual(1, len(n1.children))

        n1i = n1.children[0]
        self.assertEqual(['1', 'i'], n1i.label)
        self.assertEqual(n1i.text.strip(), 'i. iii')
        self.assertEqual(1, len(n1i.children))

        n1iA = n1i.children[0]
        self.assertEqual(['1', 'i', 'A'], n1iA.label)
        self.assertEqual(1, len(n1iA.children))

        n1iA1 = n1iA.children[0]
        self.assertEqual(['1', 'i', 'A', '1'], n1iA1.label)
        self.assertEqual(0, len(n1iA1.children))
    def test_collapse_stack(self):
        """collapse() is a helper method which wraps up all of the node
        stack's nodes with a bow"""
        m_stack = tree_utils.NodeStack()
        m_stack.add(0, Node(label=['272']))
        m_stack.add(1, Node(label=['11']))
        m_stack.add(2, Node(label=['a']))
        m_stack.add(3, Node(label=['1']))
        m_stack.add(3, Node(label=['2']))
        m_stack.add(2, Node(label=['b']))

        reg = m_stack.collapse()
        self.assertEqual(reg.label, ['272'])
        self.assertEqual(len(reg.children), 1)

        section = reg.children[0]
        self.assertEqual(section.label, ['272', '11'])
        self.assertEqual(len(section.children), 2)

        a, b = section.children
        self.assertEqual(b.label, ['272', '11', 'b'])
        self.assertEqual(len(b.children), 0)
        self.assertEqual(a.label, ['272', '11', 'a'])
        self.assertEqual(len(a.children), 2)

        a1, a2 = a.children
        self.assertEqual(a1.label, ['272', '11', 'a', '1'])
        self.assertEqual(len(a1.children), 0)
        self.assertEqual(a2.label, ['272', '11', 'a', '2'])
        self.assertEqual(len(a2.children), 0)
    def process(self, appendix, part):
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None
        # holds collections of nodes until their depth is determined
        self.nodes = []

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1]))
                    or (tag in ('P', 'FP') and title_label_pair(
                        text, self.appendix_letter, self.part)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                    or child.tag == 'RESERVED'):
                self.end_group()
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.end_group()
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                text = self.insert_dashes(child, text)
                self.paragraph_with_marker(
                    text, tree_utils.get_node_text_tags_preserved(child))
            elif child.tag == 'SEQUENCE':
                old_depth = self.depth
                self.end_group()
                self.depth = old_depth
                self.process_sequence(child)
            elif child.tag in ('P', 'FP'):
                text = self.insert_dashes(child, text)
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        self.end_group()
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        if self.m_stack.m_stack[0]:
            return self.m_stack.m_stack[0][0][1]
    def __init__(self, part):
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_counter = 0
        self.header_count = 0
        self.depth = 0
        self.appendix_letter = None
        # holds collections of nodes until their depth is determined
        self.nodes = []
def test_process_inner_child_has_citation():
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.P("1. Something something see comment 22(a)-2.i. please")
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    tree = stack.m_stack[0][0][1]
    assert tree.children == []
def test_process_inner_child_incorrect_xml():
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.child_from_string('<P><E T="03">1.</E> 111</P>')
        ctx.P("i. iii")
        ctx.child_from_string('<P><E T="03">2.</E> 222 Incorrect Content</P>')
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    assert len(stack.m_stack[0]) == 2
    def build_hierarchy(self, root, nodes, depths):
        """Given a root node, a flat list of child nodes, and a list of
        depths, build a node hierarchy around the root"""
        stack = tree_utils.NodeStack()
        stack.add(0, root)
        for node, depth_info in zip(nodes, depths):
            node.label = [mtypes.deemphasize(l) for l in node.label]
            self.replace_markerless(stack, node, depth_info.depth + 1)
            self.carry_label_to_children(node)
            if depth_info.typ != mtypes.stars:
                stack.add(1 + depth_info.depth, node)
        return stack.collapse()
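    # Illustrative note (not from the source): given a root labeled
    # ['1005', 'A'] and nodes labeled ['1'], ['a'] and ['2'] with depths of
    # 0, 1 and 0, the nodes land on stack levels 1, 2 and 1, so collapse()
    # returns the root with children '1' and '2', and 'a' nested under '1';
    # this is the same shape test_collapse_stack exercises above.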
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""

    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:  # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label, title=text.strip())
            inner_stack.add(2, node)

            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)

    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
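# A hypothetical usage sketch (supplement_xml is an assumed name for the lxml
# element wrapping the Supplement I headers and paragraphs):
#
#     root = Node(label=['1005', 'Interp'], node_type=Node.INTERP,
#                 title='Supplement I to Part 1005')
#     interp_tree = parse_from_xml(root, supplement_xml.getchildren())
#
# interp_tree is then the '1005-Interp' node with the parsed section- and
# paragraph-level interpretations nested beneath it.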
    def test_process_inner_child_has_citation(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <P>1. Something something see comment 22(a)-2.i. please</P>
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        tree = stack.m_stack[0][0][1]
        self.assertEqual(0, len(tree.children))
    def test_process_inner_child_incorrect_xml(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <P><E T="03">1.</E> 111</P>
            <P>i. iii</P>
            <P><E T="03">2.</E> 222 Incorrect Content</P>
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        self.assertEqual(2, len(stack.m_stack[0]))
    def test_unwind_stack(self):
        level_one_n = Node(label=['272'])
        level_two_n = Node(label=['a'])

        m_stack = tree_utils.NodeStack()
        m_stack.push_last((1, level_one_n))
        m_stack.add(2, level_two_n)

        self.assertEqual(m_stack.size(), 2)
        m_stack.unwind()

        self.assertEqual(m_stack.size(), 1)

        n = m_stack.pop()[0][1]
        self.assertEqual(n.children[0].label, ['272', 'a'])
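    def test_wind_and_unwind_sketch(self):
        """A minimal illustrative sketch (not from the original suite),
        mirroring the pattern used throughout these tests: nodes pushed at
        explicit depths are folded into their parents by unwind(), and the
        single remaining entry is the finished tree."""
        m_stack = tree_utils.NodeStack()
        m_stack.add(1, Node(label=['1']))   # depth-1 parent
        m_stack.add(2, Node(label=['i']))   # depth-2 child of '1'
        while m_stack.size() > 1:
            m_stack.unwind()
        root = m_stack.m_stack[0][0][1]
        self.assertEqual(root.label, ['1'])
        self.assertEqual(root.children[0].label, ['1', 'i'])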
def test_process_inner_child_no_marker():
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.P("1. 111")
        ctx.P("i. iii")
        ctx.P("Howdy Howdy")
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    i1 = stack.m_stack[0][0][1]
    assert len(i1.children) == 1
    i1i = i1.children[0]
    assert i1i.children == []
    assert i1i.text == "i. iii\n\nHowdy Howdy"
def test_process_inner_child_space():
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.P("1. 111")
        ctx.P("i. See country A. Not that country")
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    n1 = stack.m_stack[0][0][1]
    assert n1.label == ['1']
    assert len(n1.children) == 1

    n1i = n1.children[0]
    assert n1i.label == ['1', 'i']
    assert n1i.children == []
def test_process_inner_child_stars_and_inline():
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.STARS()
        ctx.P("2. Content. * * *")
        ctx.STARS()
        ctx.P("xi. Content")
        ctx.STARS()
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    tree = stack.m_stack[0][0][1]
    assert tree.label == ['2']
    assert len(tree.children) == 1
    assert tree.children[0].label == ['2', 'xi']
    assert tree.children[0].children == []
    def test_process_inner_child_no_marker(self):
        xml = """
            <ROOT>
                <HD>Title</HD>
                <P>1. 111</P>
                <P>i. iii</P>
                <P>Howdy Howdy</P>
            </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        i1 = stack.m_stack[0][0][1]
        self.assertEqual(1, len(i1.children))
        i1i = i1.children[0]
        self.assertEqual(0, len(i1i.children))
        self.assertEqual(i1i.text.strip(), "i. iii\n\nHowdy Howdy")
def test_process_inner_child_collapsed_i():
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.child_from_string(
            '<P>1. <E T="03">Keyterm text</E> i. Content content</P>')
        ctx.P("ii. Other stuff")
    node = ctx.xml.xpath('//HD')[0]
    stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(stack, node)
    while stack.size() > 1:
        stack.unwind()
    tree = stack.m_stack[0][0][1]
    assert tree.label == ['1']
    assert len(tree.children) == 2
    assert tree.children[0].label == ['1', 'i']
    assert tree.children[0].children == []
    assert tree.children[1].label == ['1', 'ii']
    assert tree.children[1].children == []
    def test_process_inner_child_collapsed_i(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <P>1. <E T="03">Keyterm text</E> i. Content content</P>
            <P>ii. Other stuff</P>
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        tree = stack.m_stack[0][0][1]
        self.assertEqual(['1'], tree.label)
        self.assertEqual(2, len(tree.children))
        self.assertEqual(['1', 'i'], tree.children[0].label)
        self.assertEqual(0, len(tree.children[0].children))
        self.assertEqual(['1', 'ii'], tree.children[1].label)
        self.assertEqual(0, len(tree.children[1].children))
    def test_process_inner_child_space(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <P>1. 111</P>
            <P>i. See country A. Not that country</P>
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        n1 = stack.m_stack[0][0][1]
        self.assertEqual(['1'], n1.label)
        self.assertEqual(1, len(n1.children))

        n1i = n1.children[0]
        self.assertEqual(['1', 'i'], n1i.label)
        self.assertEqual(0, len(n1i.children))
    def test_process_inner_child_stars_and_inline(self):
        xml = """
        <ROOT>
            <HD>Title</HD>
            <STARS />
            <P>2. Content. * * *</P>
            <STARS />
            <P>xi. Content</P>
            <STARS />
        </ROOT>"""
        node = etree.fromstring(xml).xpath('//HD')[0]
        stack = tree_utils.NodeStack()
        interpretations.process_inner_children(stack, node)
        while stack.size() > 1:
            stack.unwind()
        tree = stack.m_stack[0][0][1]
        self.assertEqual(['2'], tree.label)
        self.assertEqual(1, len(tree.children))
        self.assertEqual(['2', 'xi'], tree.children[0].label)
        self.assertEqual(0, len(tree.children[0].children))
    def setUp(self):
        self.ap = appendices.AppendixProcessor()
        self.ap.paragraph_counter = 0
        self.ap.depth = 0
        self.ap.m_stack = tree_utils.NodeStack()
        self.ap.nodes = []
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [
        ch for ch in section_xml.getchildren() if ch.tag in ['P', 'STARS']
    ]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
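                    # e.g. (illustrative) a paragraph beginning "State agency
                    # means ..." yields def_marker 'StateAgency'; text with no
                    # recognizable phrase falls back to 'def0', 'def1', ...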
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    #nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))

        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)

            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths([n.label[0] for n in nodes], [
            rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman
            ])
        ])
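        # Each candidate returned by derive_depths is a full depth
        # assignment: it carries a .weight (used for ranking below) and
        # yields one per-marker entry with a .typ and a .depth, consumed as
        # 1 + par.depth when nodes are pushed onto the stack.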

    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))

    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)
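    # e.g. (illustrative) a section_no of u'§§ 1005.1-1005.3' with reg_part
    # '1005' expands section_nums to [1, 2, 3]; a single u'§ 1005.1' stays
    # [1].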

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text,
                         label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
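# Hypothetical usage (section_xml is an assumed name for a parsed <SECTION>
# element): build_from_section('1005', section_xml) returns one Node per
# section number the SECTNO covers, so a SECTNO of u'§ 1005.7' yields a
# single tree while u'§§ 1005.7-1005.9' yields three, one per section in
# the span.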