def test_process_inner_child():
    """Paragraph markers nest into the single chain 1 -> i -> A -> 1,
    with STARS placeholders skipped over."""
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.P("1. 111. i. iii")
        ctx.STARS()
        ctx.P("A. AAA")
        ctx.child_from_string('<P><E T="03">1.</E> eee</P>')
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    # Walk the chain one level at a time, checking label and fan-out
    level1 = node_stack.m_stack[0][0][1]
    assert level1.label == ['1']
    assert len(level1.children) == 1
    level2 = level1.children[0]
    assert level2.label == ['1', 'i']
    assert level2.text == 'i. iii'
    assert len(level2.children) == 1
    level3 = level2.children[0]
    assert level3.label == ['1', 'i', 'A']
    assert len(level3.children) == 1
    level4 = level3.children[0]
    assert level4.label == ['1', 'i', 'A', '1']
    assert level4.children == []
def test_process_inner_child(self):
    """Paragraph markers nest into the single chain 1 -> i -> A -> 1."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P>1. 111. i. iii</P>
        <STARS />
        <P>A. AAA</P>
        <P><E T="03">1.</E> eee</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    level1 = node_stack.m_stack[0][0][1]
    self.assertEqual(level1.label, ['1'])
    self.assertEqual(len(level1.children), 1)
    level2 = level1.children[0]
    self.assertEqual(level2.label, ['1', 'i'])
    self.assertEqual(level2.text.strip(), 'i. iii')
    self.assertEqual(len(level2.children), 1)
    level3 = level2.children[0]
    self.assertEqual(level3.label, ['1', 'i', 'A'])
    self.assertEqual(len(level3.children), 1)
    level4 = level3.children[0]
    self.assertEqual(level4.label, ['1', 'i', 'A', '1'])
    self.assertEqual(len(level4.children), 0)
def test_collapse_stack(self):
    """collapse() is a helper method which wraps up all of the node
    stack's nodes with a bow"""
    stack = tree_utils.NodeStack()
    # Depth/label pairs describing the tree 272 -> 11 -> (a -> (1, 2), b)
    for depth, part in ((0, '272'), (1, '11'), (2, 'a'),
                        (3, '1'), (3, '2'), (2, 'b')):
        stack.add(depth, Node(label=[part]))
    reg = stack.collapse()
    self.assertEqual(['272'], reg.label)
    self.assertEqual(1, len(reg.children))
    section = reg.children[0]
    self.assertEqual(['272', '11'], section.label)
    self.assertEqual(2, len(section.children))
    child_a, child_b = section.children
    self.assertEqual(['272', '11', 'a'], child_a.label)
    self.assertEqual(2, len(child_a.children))
    self.assertEqual(['272', '11', 'b'], child_b.label)
    self.assertEqual(0, len(child_b.children))
    a1, a2 = child_a.children
    self.assertEqual(['272', '11', 'a', '1'], a1.label)
    self.assertEqual(0, len(a1.children))
    self.assertEqual(['272', '11', 'a', '2'], a2.label)
    self.assertEqual(0, len(a2.children))
def process(self, appendix, part):
    """Parse an appendix XML element into a Node tree.

    Resets all per-appendix state, walks the appendix's children
    dispatching on tag type, then unwinds the stack and returns the root
    node (or None if nothing was produced).
    """
    self.m_stack = tree_utils.NodeStack()
    self.part = part
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # A subheader is either an HD without a plain initial marker, or a
        # P/FP whose text parses as an appendix section title
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1])) or
                (tag in ('P', 'FP') and
                 title_label_pair(text, self.appendix_letter, self.part)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        # BUG FIX: child.attrib['SOURCE'] raised KeyError for an HD with
        # no SOURCE attribute; .get() safely yields None instead
        if ((child.tag == 'HD' and child.get('SOURCE') == 'HED')
                or child.tag == 'RESERVED'):
            self.end_group()
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.end_group()
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text, tree_utils.get_node_text_tags_preserved(child))
        elif child.tag == 'SEQUENCE':
            # end_group() clears depth; a SEQUENCE should not change it
            old_depth = self.depth
            self.end_group()
            self.depth = old_depth
            self.process_sequence(child)
        elif child.tag in ('P', 'FP'):
            text = self.insert_dashes(child, text)
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))

    self.end_group()
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        return self.m_stack.m_stack[0][0][1]
def __init__(self, part):
    """Initialize per-appendix parsing state for the given CFR part."""
    self.part = part
    self.m_stack = tree_utils.NodeStack()
    # counters used while walking the appendix XML
    self.paragraph_counter = 0
    self.header_count = 0
    self.depth = 0
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
def test_process_inner_child_has_citation():
    """A citation inside a paragraph ('comment 22(a)-2.i.') must not be
    mistaken for a new paragraph marker."""
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.P("1. Something something see comment 22(a)-2.i. please")
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    root = node_stack.m_stack[0][0][1]
    assert root.children == []
def test_process_inner_child_incorrect_xml():
    """Out-of-order markers leave two separate trees at the stack root
    rather than one merged hierarchy."""
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.child_from_string('<P><E T="03">1.</E> 111</P>')
        ctx.P("i. iii")
        ctx.child_from_string('<P><E T="03">2.</E> 222 Incorrect Content</P>')
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    assert len(node_stack.m_stack[0]) == 2
def build_hierarchy(self, root, nodes, depths):
    """Given a root node, a flat list of child nodes, and a list of
    depths, build a node hierarchy around the root"""
    node_stack = tree_utils.NodeStack()
    node_stack.add(0, root)
    for node, depth_info in zip(nodes, depths):
        node.label = [mtypes.deemphasize(part) for part in node.label]
        self.replace_markerless(node_stack, node, depth_info.depth + 1)
        self.carry_label_to_children(node)
        # STARS entries are placeholders for omitted text; they never
        # become real nodes in the tree
        if depth_info.typ != mtypes.stars:
            node_stack.add(1 + depth_info.depth, node)
    return node_stack.collapse()
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations. Returns the assembled interpretation tree."""
    supplement_nodes = [root]

    last_label = root.label   # label of the most recently seen header
    header_count = 0          # numbers headers that carry no label: h1, h2, ...
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:
                # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            # Add placeholders for any hierarchy levels skipped between the
            # previous header's label and this one
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)

            # Parse this header's body paragraphs into the stack, then
            # unwind to a single tree and collect it
            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        # Strip emphasis markup that leaked into label parts
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
def test_process_inner_child_has_citation(self):
    """A citation within the paragraph text must not spawn children."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P>1. Something something see comment 22(a)-2.i. please</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    tree = node_stack.m_stack[0][0][1]
    self.assertEqual(len(tree.children), 0)
def test_process_inner_child_incorrect_xml(self):
    """Out-of-order markers leave two separate trees at the stack root."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P><E T="03">1.</E> 111</P>
        <P>i. iii</P>
        <P><E T="03">2.</E> 222 Incorrect Content</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    self.assertEqual(len(node_stack.m_stack[0]), 2)
def test_unwind_stack(self):
    """unwind() should pop the deepest level off the stack and attach its
    nodes as children of the level above."""
    level_one_n = Node(label=['272'])
    level_two_n = Node(label=['a'])
    m_stack = tree_utils.NodeStack()
    m_stack.push_last((1, level_one_n))
    m_stack.add(2, level_two_n)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual
    self.assertEqual(m_stack.size(), 2)
    m_stack.unwind()
    self.assertEqual(m_stack.size(), 1)
    n = m_stack.pop()[0][1]
    self.assertEqual(n.children[0].label, ['272', 'a'])
def test_process_inner_child_no_marker():
    """An unmarked trailing paragraph is folded into the text of the
    previous marked paragraph rather than becoming a node."""
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.P("1. 111")
        ctx.P("i. iii")
        ctx.P("Howdy Howdy")
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    assert len(top.children) == 1
    inner = top.children[0]
    assert inner.text == "i. iii\n\nHowdy Howdy"
    assert inner.children == []
def test_process_inner_child_space():
    """'country A.' mid-sentence must not be read as an 'A.' marker."""
    with XMLBuilder('ROOT') as ctx:
        ctx.HD("Title")
        ctx.P("1. 111")
        ctx.P("i. See country A. Not that country")
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    assert top.label == ['1']
    assert len(top.children) == 1
    child = top.children[0]
    assert child.label == ['1', 'i']
    assert child.children == []
def test_process_inner_child_stars_and_inline():
    """STARS elements and an inline trailing '* * *' are skipped over
    when building the hierarchy."""
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.STARS()
        ctx.P("2. Content. * * *")
        ctx.STARS()
        ctx.P("xi. Content")
        ctx.STARS()
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    assert top.label == ['2']
    assert len(top.children) == 1
    only_child = top.children[0]
    assert only_child.label == ['2', 'xi']
    assert only_child.children == []
def test_process_inner_child_no_marker(self):
    """An unmarked trailing paragraph merges into the previous node."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P>1. 111</P>
        <P>i. iii</P>
        <P>Howdy Howdy</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    self.assertEqual(len(top.children), 1)
    inner = top.children[0]
    self.assertEqual(inner.text.strip(), "i. iii\n\nHowdy Howdy")
    self.assertEqual(len(inner.children), 0)
def test_process_inner_child_collapsed_i():
    """A roman 'i.' collapsed into the same P as the '1.' keyterm still
    becomes its own child node, sibling to the following 'ii.'."""
    with XMLBuilder() as ctx:
        ctx.HD("Title")
        ctx.child_from_string(
            '<P>1. <E T="03">Keyterm text</E> i. Content content</P>')
        ctx.P("ii. Other stuff")
    header = ctx.xml.xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    gpo_cfr.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    assert top.label == ['1']
    assert len(top.children) == 2
    first, second = top.children
    assert first.label == ['1', 'i']
    assert first.children == []
    assert second.label == ['1', 'ii']
    assert second.children == []
def test_process_inner_child_collapsed_i(self):
    """A collapsed 'i.' after a keyterm still becomes its own node."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P>1. <E T="03">Keyterm text</E> i. Content content</P>
        <P>ii. Other stuff</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    self.assertEqual(top.label, ['1'])
    self.assertEqual(len(top.children), 2)
    first, second = top.children
    self.assertEqual(first.label, ['1', 'i'])
    self.assertEqual(len(first.children), 0)
    self.assertEqual(second.label, ['1', 'ii'])
    self.assertEqual(len(second.children), 0)
def test_process_inner_child_space(self):
    """'country A.' mid-sentence must not be read as an 'A.' marker."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <P>1. 111</P>
        <P>i. See country A. Not that country</P>
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    self.assertEqual(top.label, ['1'])
    self.assertEqual(len(top.children), 1)
    child = top.children[0]
    self.assertEqual(child.label, ['1', 'i'])
    self.assertEqual(len(child.children), 0)
def test_process_inner_child_stars_and_inline(self):
    """STARS elements and inline '* * *' markers are skipped over."""
    xml = """
    <ROOT>
        <HD>Title</HD>
        <STARS />
        <P>2. Content. * * *</P>
        <STARS />
        <P>xi. Content</P>
        <STARS />
    </ROOT>"""
    header = etree.fromstring(xml).xpath('//HD')[0]
    node_stack = tree_utils.NodeStack()
    interpretations.process_inner_children(node_stack, header)
    while node_stack.size() > 1:
        node_stack.unwind()
    top = node_stack.m_stack[0][0][1]
    self.assertEqual(top.label, ['2'])
    self.assertEqual(len(top.children), 1)
    only_child = top.children[0]
    self.assertEqual(only_child.label, ['2', 'xi'])
    self.assertEqual(len(only_child.children), 0)
def setUp(self):
    """Give every test a fresh AppendixProcessor with empty state."""
    self.ap = appendices.AppendixProcessor()
    self.ap.depth = 0
    self.ap.paragraph_counter = 0
    self.ap.nodes = []
    self.ap.m_stack = tree_utils.NodeStack()
def build_from_section(reg_part, section_xml):
    """Construct regtext Nodes from a single SECTION XML element.

    ``reg_part`` is the CFR part string (e.g. '1026'); ``section_xml`` is
    the lxml SECTION element. Returns a list of section Nodes (one per
    section number the SECTNO covers), each carrying its paragraph tree.
    """
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    # e.g. "1026.5" pulled out of "§ 1026.5"; raw string avoids the
    # invalid-escape warning the old '[0-9]+\.[0-9]+' produced
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    # Some sections carry a manually-specified paragraph depth hierarchy
    manual_hierarchy_flag = (
        reg_part in PARAGRAPH_HIERARCHY and
        section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part])

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # Unmarked paragraph: possibly a definition ("X means ...")
            # rather than plain intro text
            if len(nodes) > 0:
                # BUG FIX: str.find() returns -1 (truthy) when the needle
                # is absent, so the old bare find() made this branch
                # unconditional; compare against -1 explicitly.
                if (subject_text.find('Definitions.') > -1 or
                        nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # An inline trailing "* * *" means omitted sub-paragraphs
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()
    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(
                    len(nodes), len(depths)))
    elif nodes and not manual_hierarchy_flag:
        # No consistent depth assignment could be derived; flatten all
        # paragraphs to a single depth as a fallback
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers, e.g. "§§ 1026.5-7": expand first-last into
    # the full list
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])

    return nodes
def build_from_section(reg_part, section_xml):
    """Build regtext Node(s) from one SECTION XML element.

    ``reg_part`` is the CFR part string; ``section_xml`` the lxml SECTION
    element. Returns a list of section Nodes -- one per section number in
    the SECTNO (to handle spans) -- each with its paragraph tree attached."""
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())
        if ch.tag == 'STARS':
            # Placeholder for text omitted from the source document
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # Unmarked paragraph -> part of the section's intro text
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # A trailing inline "* * *" indicates omitted sub-paragraphs
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                # Strip emphasis tags from label parts before stacking
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text
    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))
    # Span of section numbers, e.g. "§§ 100.1-3": expand first-last into
    # the full list of numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)
    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]
        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text
        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text
        # Hang the whole paragraph stack beneath this section node
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])
    return nodes