def add_nodes_to_stack(nodes, inner_stack):
    """Choose the highest-weighted depth assignment for `nodes` and place
    each non-star node on `inner_stack` at level 3 + its assigned depth.
    Nothing is added when no depth assignment can be derived."""
    first_labels = [node.label[0] for node in nodes]
    marker_order = [(mtypes.ints, mtypes.em_ints),
                    (mtypes.roman, mtypes.upper),
                    mtypes.upper, mtypes.em_ints, mtypes.em_roman]
    # Constraint solver: all depth assignments consistent with the markers
    candidates = derive_depths(first_labels,
                               [rules.depth_type_order(marker_order)])
    if not candidates:
        return

    # Weight the candidates with our heuristic and keep the top-ranked one
    candidates = heuristics.prefer_multiple_children(candidates, 0.5)
    ranked = sorted(candidates, key=lambda d: d.weight, reverse=True)
    best = ranked[0]

    for node, assignment in zip(nodes, best):
        if assignment.typ != mtypes.stars:
            top = inner_stack.peek()
            # Emphasis tags only matter while deriving depths; drop them
            node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                          for part in node.label]
            if len(top) == 0:
                inner_stack.push_last((3 + assignment.depth, node))
            else:
                inner_stack.add(3 + assignment.depth, node)
def select_depth(self, depths):
    """Override ParagraphProcessor's selection with a different set of
    heuristic weights; returns the highest-weighted solution."""
    weighted_heuristics = (
        (heuristics.prefer_diff_types_diff_levels, 0.2),
        (heuristics.prefer_multiple_children, 0.4),
        (heuristics.prefer_shallow_depths, 0.8),
        (heuristics.prefer_no_markerless_sandwich, 0.2),
    )
    # Each heuristic re-weights the candidate solutions in turn
    for heuristic, weight in weighted_heuristics:
        depths = heuristic(depths, weight)
    ranked = sorted(depths, key=lambda d: d.weight, reverse=True)
    return ranked[0]
def select_depth(self, depths):
    """Several solutions may satisfy the depth constraints; weight them
    with our heuristics and return the highest-weighted one."""
    weighted_heuristics = (
        (heuristics.prefer_diff_types_diff_levels, 0.8),
        (heuristics.prefer_multiple_children, 0.4),
        (heuristics.prefer_shallow_depths, 0.2),
    )
    # Each heuristic re-weights the candidate solutions in turn
    for heuristic, weight in weighted_heuristics:
        depths = heuristic(depths, weight)
    ranked = sorted(depths, key=lambda d: d.weight, reverse=True)
    return ranked[0]
def select_depth(self, depths):
    """Several solutions may satisfy the depth constraints; weight them
    with our heuristics and return the highest-weighted one."""
    weighted_heuristics = (
        (heuristics.prefer_diff_types_diff_levels, 0.8),
        (heuristics.prefer_multiple_children, 0.4),
        (heuristics.prefer_shallow_depths, 0.2),
        (heuristics.prefer_no_markerless_sandwich, 0.2),
    )
    # Each heuristic re-weights the candidate solutions in turn
    for heuristic, weight in weighted_heuristics:
        depths = heuristic(depths, weight)
    ranked = sorted(depths, key=lambda d: d.weight, reverse=True)
    return ranked[0]
def test_prefer_multiple_children(self):
    """Nine markers a..i: reading all of them as same-level lower-case
    letters must outweigh reading the final one as a nested roman i."""
    # Markers a through i, every one a lower-case letter at depth 0
    all_lower = {}
    for position in range(9):
        all_lower['type%d' % position] = markers.lower
        all_lower['idx%d' % position] = position
        all_lower['depth%d' % position] = 0

    # Same run, except the last marker is roman numeral i one level down
    roman_tail = all_lower.copy()
    for key, value in (('type8', markers.roman), ('idx8', 0),
                       ('depth8', 1)):
        roman_tail[key] = value

    solutions = prefer_multiple_children(
        [Solution(all_lower), Solution(roman_tail)], 0.5)

    self.assertEqual(solutions[0].weight, 1.0)
    self.assertTrue(solutions[1].weight < solutions[0].weight)
def test_prefer_multiple_children(self):
    """Should a trailing i be a roman numeral or a lower case?"""
    # a through i, all lower-case letters at depth 0
    for letter in 'abcdefghi':
        self.add_assignment(markers.lower, letter, 0)
    all_lower = self.solution

    # Alternative reading: the final marker is roman i, one level down
    roman_tail = all_lower.copy()
    for key, value in (('type8', markers.roman), ('idx8', 0),
                       ('depth8', 1)):
        roman_tail[key] = value

    solutions = heuristics.prefer_multiple_children(
        [Solution(all_lower), Solution(roman_tail)], 0.5)

    self.assertEqual(solutions[0].weight, 1.0)
    self.assertTrue(solutions[1].weight < solutions[0].weight)
def add_nodes_to_stack(nodes, inner_stack):
    """Derive the most likely depth for each node and add the non-star
    nodes to the provided stack at level 3 + depth. The stack is left
    untouched when no depth assignment can be derived."""
    first_labels = [node.label[0] for node in nodes]
    marker_order = [(mtypes.ints, mtypes.em_ints),
                    (mtypes.roman, mtypes.upper),
                    mtypes.upper, mtypes.em_ints, mtypes.em_roman]
    # Constraint programming yields every consistent depth assignment
    solutions = derive_depths(first_labels,
                              [rules.depth_type_order(marker_order)])
    if not solutions:
        return

    # Rank by heuristic weight; the first entry is the preferred solution
    solutions = heuristics.prefer_multiple_children(solutions, 0.5)
    ranked = sorted(solutions, key=lambda d: d.weight, reverse=True)
    preferred = ranked[0]

    for node, assignment in zip(nodes, preferred):
        if assignment.typ != mtypes.stars:
            top = inner_stack.peek()
            # Emphasis tags only matter while deriving depths; drop them
            node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                          for part in node.label]
            if len(top) == 0:
                inner_stack.push_last((3 + assignment.depth, node))
            else:
                inner_stack.add(3 + assignment.depth, node)
def process_inner_children(inner_stack, xml_node):
    """Turn the siblings that follow `xml_node` (up to the next title) into
    interpretation nodes and add them to `inner_stack`. This is very
    similar to reg_text.py:build_from_section()"""
    nodes = []

    def keep(node):
        """Record a node, followed by an inline-stars marker when the
        node's text ends with '* * *'"""
        nodes.append(node)
        if node.text.endswith('* * *'):
            nodes.append(Node(label=[mtypes.INLINE_STARS]))

    siblings = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    for child in filter(lambda c: c.tag in ('P', 'STARS'), siblings):
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)

        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 accounts for matching the character + period. Entry k is
            # where collapsed marker k starts; the last entry is the end of
            # the paragraph text.
            boundaries = ([m.end() - 2 for m in collapsed] +
                          [len(node_text)])

            # Node for this paragraph: everything before the first
            # collapsed marker
            lead = Node(node_text[0:boundaries[0]], label=[first_marker],
                        node_type=Node.INTERP)
            lead.tagged_text = text_with_tags
            keep(lead)

            # Collapsed-marker children
            for match, start, end in zip(collapsed, boundaries,
                                         boundaries[1:]):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                keep(Node(node_text[start:end], label=[marker],
                          node_type=Node.INTERP))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    # Use constraint programming to figure out possible depth assignments
    first_labels = [node.label[0] for node in nodes]
    marker_order = [(mtypes.ints, mtypes.em_ints),
                    (mtypes.roman, mtypes.upper),
                    mtypes.upper, mtypes.em_ints, mtypes.em_roman]
    candidates = derive_depths(first_labels,
                               [rules.depth_type_order(marker_order)])
    if not candidates:
        return

    # Find the assignment which violates the least of our heuristics
    candidates = heuristics.prefer_multiple_children(candidates, 0.5)
    ranked = sorted(candidates, key=lambda d: d.weight, reverse=True)
    best = ranked[0]
    for node, assignment in zip(nodes, best):
        if assignment.typ != mtypes.stars:
            top = inner_stack.peek()
            node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                          for part in node.label]
            if len(top) == 0:
                inner_stack.push_last((3 + assignment.depth, node))
            else:
                inner_stack.add(3 + assignment.depth, node)
def process_inner_children(inner_stack, xml_node):
    """Convert the siblings after `xml_node`, stopping at the next title,
    into interpretation nodes and add them to `inner_stack`. This is very
    similar to reg_text.py:build_from_section()"""
    nodes = []

    def keep(node):
        """Record a node, followed by an inline-stars marker when the
        node's text ends with '* * *'"""
        nodes.append(node)
        if node.text.endswith('* * *'):
            nodes.append(Node(label=[mtypes.INLINE_STARS]))

    siblings = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    for child in filter(lambda c: c.tag in ('P', 'STARS'), siblings):
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)

        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 accounts for matching the character + period. Entry k is
            # where collapsed marker k starts; the last entry is the end of
            # the paragraph text.
            boundaries = ([m.end() - 2 for m in collapsed] +
                          [len(node_text)])

            # Node for this paragraph: everything before the first
            # collapsed marker
            lead = Node(node_text[0:boundaries[0]], label=[first_marker],
                        node_type=Node.INTERP)
            lead.tagged_text = text_with_tags
            keep(lead)

            # Collapsed-marker children
            for match, start, end in zip(collapsed, boundaries,
                                         boundaries[1:]):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                keep(Node(node_text[start:end], label=[marker],
                          node_type=Node.INTERP))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    # Use constraint programming to figure out possible depth assignments
    first_labels = [node.label[0] for node in nodes]
    marker_order = [(mtypes.ints, mtypes.em_ints),
                    (mtypes.roman, mtypes.upper),
                    mtypes.upper, mtypes.em_ints, mtypes.em_roman]
    candidates = derive_depths(first_labels,
                               [rules.depth_type_order(marker_order)])
    if not candidates:
        return

    # Find the assignment which violates the least of our heuristics
    candidates = heuristics.prefer_multiple_children(candidates, 0.5)
    ranked = sorted(candidates, key=lambda d: d.weight, reverse=True)
    best = ranked[0]
    for node, assignment in zip(nodes, best):
        if assignment.typ != mtypes.stars:
            top = inner_stack.peek()
            node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                          for part in node.label]
            if len(top) == 0:
                inner_stack.push_last((3 + assignment.depth, node))
            else:
                inner_stack.add(3 + assignment.depth, node)
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    inner_stack -- stack the resulting nodes are added to, at level
        3 + depth
    xml_node -- element whose following siblings, up to the next title,
        are processed; its text is searched for a "part.section" number to
        look up a configured manual hierarchy
    parent -- optional node that receives marker-less text when there is
        no earlier paragraph to append it to

    Depths come from one of three places: a manual hierarchy (configured
    in PARAGRAPH_HIERARCHY and/or given by `depth` attributes in the XML),
    the constraint solver, or, when the solver finds nothing, a flat list
    at level 3."""
    def push(node, level):
        """Strip emphasis tags from the label and add the node to the
        stack at `level`"""
        top = inner_stack.peek()
        node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                      for part in node.label]
        if len(top) == 0:
            inner_stack.push_last((level, node))
        else:
            inner_stack.add(level, node)

    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search(r'[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part = part_and_section.split('.')[0]
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY and
                part_and_section in PARAGRAPH_HIERARCHY[part]):
            # Copy the configured list: depths found in the XML are
            # appended below and must not leak into the shared config
            manual_hierarchy = list(
                PARAGRAPH_HIERARCHY[part][part_and_section])
    except Exception:
        # Deliberately best effort: a title without text or without a
        # section number simply means no configured hierarchy applies
        pass

    siblings = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, child in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), siblings)):
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'depth' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has a depth
        # specified; if not, the length check further down fails.
        if child.get("depth") is not None:
            manual_hierarchy.append(int(child.get("depth")))

        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            # Without a marker, label the node by its position
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            previous = nodes[-1] if nodes else parent
            if previous is None:
                # No earlier paragraph and no parent was supplied, so
                # there is nothing to attach this text to
                logging.warning(
                    "Couldn't determine interp marker and there is no "
                    "previous paragraph to append to: %s", node_text)
                continue
            logging.warning(
                "Couldn't determine interp marker. Appending to "
                "previous paragraph: %s", node_text)
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    if manual_hierarchy:
        # use manual hierarchy if it's specified
        if nodes:
            logging.warning('Using manual depth hierarchy.')
            if len(nodes) == len(manual_hierarchy):
                for node, depth in zip(nodes, manual_hierarchy):
                    push(node, 3 + depth)
            else:
                logging.error(
                    'Manual hierarchy length does not match node list '
                    'length!')
        return

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths([node.label[0] for node in nodes], [
        rules.depth_type_order(
            [(mtypes.ints, mtypes.em_ints),
             (mtypes.lower, mtypes.roman, mtypes.upper),
             mtypes.upper, mtypes.em_ints, mtypes.em_roman])
    ])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        for node, par in zip(nodes, depths[0]):
            if par.typ != mtypes.stars:
                push(node, 3 + par.depth)
    elif nodes:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            push(node, 3)
def build_from_section(reg_part, section_xml):
    """Build the node tree(s) for one section element. Paragraphs with
    markers become child nodes nested by derived depth; paragraphs without
    markers become section intro text. Returns a list holding one section
    Node per section number found in SECTNO."""
    intro_texts = []    # (plain, tagged) text of marker-less paragraphs
    nodes = []

    def is_paragraph(child):
        return child.tag in ('P', 'STARS')

    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(is_paragraph, section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
            continue
        if not markers_list:
            intro_texts.append((text, tagged_text))
            continue

        for m, node_text in get_markers_and_text(ch, markers_list):
            para = Node(node_text[0], [], [m], source_xml=ch)
            para.tagged_text = unicode(node_text[1])
            nodes.append(para)
        # Checked once per XML paragraph, against its last marker's text
        if node_text[0].endswith('* * *'):
            nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    # Use constraint programming to figure out possible depth assignments
    first_labels = [node.label[0] for node in nodes]
    marker_order = [mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                    mtypes.em_ints, mtypes.em_roman]
    candidates = derive_depths(first_labels,
                               [rules.depth_type_order(marker_order)])
    m_stack = tree_utils.NodeStack()
    if candidates:
        # Find the assignment which violates the least of our heuristics
        candidates = heuristics.prefer_multiple_children(candidates, 0.5)
        ranked = sorted(candidates, key=lambda d: d.weight, reverse=True)
        best = ranked[0]
        for node, assignment in zip(nodes, best):
            if assignment.typ != mtypes.stars:
                top = m_stack.peek()
                node.label = [
                    part.replace('<E T="03">', '').replace('</E>', '')
                    for part in node.label]
                if len(top) == 0:
                    m_stack.push_last((1 + assignment.depth, node))
                else:
                    m_stack.add(1 + assignment.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    # Reserved sections carry a RESERVED element instead of a SUBJECT
    subject_xml = (section_xml.xpath('SUBJECT') or
                   section_xml.xpath('RESERVED'))
    subject_text = subject_xml[0].text

    section_nums = [int(found.group(1)) for found in
                    re.finditer(r'%s\.(\d+)' % reg_part, section_no)]
    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = list(range(first, last + 1))

    plain_sect_texts = [plain for plain, _ in intro_texts]
    tagged_sect_texts = [tagged for _, tagged in intro_texts]

    sections = []
    for section_number in section_nums:
        section_number = str(section_number)
        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        # The section goes under everything already on the stack; unwind
        # until only the section level is left, then take the section off
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        sections.append(m_stack.pop()[0][1])

    return sections
def build_from_section(reg_part, section_xml):
    """Build the node tree(s) for one section element.

    reg_part -- regulation part as a string; used for labels, titles and
        the PARAGRAPH_HIERARCHY lookup
    section_xml -- section element; must contain SECTNO and either
        SUBJECT or RESERVED

    Paragraphs with markers become child nodes. Paragraphs without markers
    become either definition nodes or section intro text. Depths come from
    the manual hierarchy configured for this section when there is one,
    otherwise from the constraint solver, with a flat fallback when the
    solver finds nothing. Returns a list holding one section Node per
    section number found in SECTNO."""
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text
    # The subject element may have no text at all (the title code below
    # already allows for that), so never call str methods on it directly
    is_definitions_section = 'Definitions.' in (subject_text or '')

    manual_hierarchy_flag = (
        reg_part in PARAGRAPH_HIERARCHY and
        section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part])

    def camel_case(words):
        """Join words into one marker, upper-casing each first letter"""
        return ''.join([word[0].upper() + word[1:] for word in words])

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0   # running counter for generated 'defN' markers
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next
            # to them?
            if len(nodes) > 0:
                # NOTE(review): the second test uses find() as a boolean,
                # so it is true unless the phrase sits at index 0 of the
                # previous node's text. That looks unintended, but it is
                # kept as-is because existing manual hierarchies may count
                # on these paragraphs becoming nodes -- confirm before
                # changing.
                if (is_definitions_section or nodes[-1].text.find(
                        'For the purposes of this section')):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = camel_case(
                            text.split('means')[0].strip().split())
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = camel_case(
                            text.split('shall')[0].strip().split())
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # Checked once per XML paragraph, against its last marker's
            # text
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    m_stack = tree_utils.NodeStack()

    def push(node, level):
        """Strip emphasis tags from the label and add the node to the
        stack at `level`"""
        top = m_stack.peek()
        node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                      for part in node.label]
        if len(top) == 0:
            m_stack.push_last((level, node))
        else:
            m_stack.add(level, node)

    if manual_hierarchy_flag:
        if nodes:
            logging.warning('Using manual depth hierarchy.')
            depths = PARAGRAPH_HIERARCHY[reg_part][
                section_no_without_marker]
            if len(nodes) == len(depths):
                for node, depth in zip(nodes, depths):
                    push(node, 1 + depth)
            else:
                logging.error(
                    'Manual hierarchy length does not match node list '
                    'length!'
                    ' ({0} nodes but {1} provided)'.format(
                        len(nodes), len(depths)))
    else:
        # Use constraint programming to figure out possible depth
        # assignments
        depths = derive_depths([node.label[0] for node in nodes], [
            rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman
            ])
        ])
        if depths:
            # Find the assignment which violates the least of our
            # heuristics
            depths = heuristics.prefer_multiple_children(depths, 0.5)
            depths = sorted(depths, key=lambda d: d.weight, reverse=True)
            for node, par in zip(nodes, depths[0]):
                if par.typ != mtypes.stars:
                    push(node, 1 + par.depth)
        elif nodes:
            logging.warning(
                'Could not determine depth when parsing {0}:\n{1}'.format(
                    section_no_without_marker,
                    [node.label[0] for node in nodes]))
            # No usable depths: add every node at one common level
            for node in nodes:
                push(node, 3)

    nodes = []
    section_nums = [int(match.group(1)) for match in
                    re.finditer(r'%s\.(\d+)' % reg_part, section_no)]

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = list(range(first, last + 1))

    plain_sect_texts = [s[0] for s in section_texts]
    tagged_sect_texts = [s[1] for s in section_texts]
    for section_number in section_nums:
        section_number = str(section_number)

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        # The section goes under everything already on the stack; unwind
        # until only the section level is left, then take the section off
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])

    return nodes
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    inner_stack -- stack the resulting nodes are added to, at level
        3 + depth
    xml_node -- element whose following siblings, up to the next title,
        are processed; its text is searched for a "part.section" number to
        look up a configured manual hierarchy
    parent -- optional node that receives marker-less text when there is
        no earlier paragraph to append it to

    Depths come from one of three places: a manual hierarchy (configured
    in PARAGRAPH_HIERARCHY and/or given by `depth` attributes in the XML),
    the constraint solver, or, when the solver finds nothing, a flat list
    at level 3."""
    def push(node, level):
        """Strip emphasis tags from the label and add the node to the
        stack at `level`"""
        top = inner_stack.peek()
        node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                      for part in node.label]
        if len(top) == 0:
            inner_stack.push_last((level, node))
        else:
            inner_stack.add(level, node)

    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search(r'[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part = part_and_section.split('.')[0]
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY and
                part_and_section in PARAGRAPH_HIERARCHY[part]):
            # Copy the configured list: depths found in the XML are
            # appended below and must not leak into the shared config
            manual_hierarchy = list(
                PARAGRAPH_HIERARCHY[part][part_and_section])
    except Exception:
        # Deliberately best effort: a title without text or without a
        # section number simply means no configured hierarchy applies
        pass

    siblings = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for i, child in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), siblings)):
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'depth' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has a depth
        # specified; if not, the length check further down fails.
        if child.get("depth") is not None:
            manual_hierarchy.append(int(child.get("depth")))

        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            # Without a marker, label the node by its position
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            previous = nodes[-1] if nodes else parent
            if previous is None:
                # No earlier paragraph and no parent was supplied, so
                # there is nothing to attach this text to
                logging.warning(
                    "Couldn't determine interp marker and there is no "
                    "previous paragraph to append to: %s", node_text)
                continue
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    if manual_hierarchy:
        # use manual hierarchy if it's specified
        if nodes:
            logging.warning('Using manual depth hierarchy.')
            if len(nodes) == len(manual_hierarchy):
                for node, depth in zip(nodes, manual_hierarchy):
                    push(node, 3 + depth)
            else:
                logging.error(
                    'Manual hierarchy length does not match node list '
                    'length!')
        return

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([
            (mtypes.ints, mtypes.em_ints),
            (mtypes.lower, mtypes.roman, mtypes.upper),
            mtypes.upper, mtypes.em_ints, mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        for node, par in zip(nodes, depths[0]):
            if par.typ != mtypes.stars:
                push(node, 3 + par.depth)
    elif nodes:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            push(node, 3)
def build_from_section(reg_part, section_xml):
    """Build the node tree(s) for one section element.

    reg_part -- regulation part as a string; used for labels, titles and
        the PARAGRAPH_HIERARCHY lookup
    section_xml -- section element; must contain SECTNO and either
        SUBJECT or RESERVED

    Paragraph depths come from a manual hierarchy when one exists
    (configured in PARAGRAPH_HIERARCHY and/or given by `depth` attributes
    on the children), otherwise from the constraint solver, with a flat
    fallback when the solver finds nothing. A manual entry is either a
    plain depth or a (depth, marker) pair; for a pair the marker is stored
    on the node. Returns a list holding one section Node per section
    number found in SECTNO."""
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text
    # The subject element may have no text at all (the title code below
    # already allows for that), so never call str methods on it directly
    is_definitions_section = 'Definitions.' in (subject_text or '')

    manual_hierarchy = []
    if (reg_part in PARAGRAPH_HIERARCHY and
            section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        # Copy the configured list: depths found in the XML are appended
        # below and must not leak into the shared config
        manual_hierarchy = list(
            PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker])

    def camel_case(words):
        """Join words into one marker, upper-casing each first letter"""
        return ''.join([word[0].upper() + word[1:] for word in words])

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0   # running counter for generated 'defN' markers
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        # If the child has a 'depth' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has a depth
        # specified; if not, the length check further down fails.
        if ch.get("depth") is not None:
            manual_hierarchy.append(int(ch.get("depth")))

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            if not manual_hierarchy:
                # No manual hierarchy specified, append to the section.
                section_texts.append((text, tagged_text))
            elif len(nodes) > 0:
                # is this a bunch of definitions that don't have numbers
                # next to them?
                # NOTE(review): the second test uses find() as a boolean,
                # so it is true unless the phrase sits at index 0 of the
                # previous node's text. That looks unintended, but it is
                # kept as-is: XML-supplied depths need every child to
                # become a node, which this near-always-true test
                # provides -- confirm before changing.
                if (is_definitions_section or nodes[-1].text.find(
                        'For the purposes of this section')):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = camel_case(
                            text.split('means')[0].strip().split())
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = camel_case(
                            text.split('shall')[0].strip().split())
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            elif len(children) > 1:
                def_marker = 'def{0}'.format(i)
                n = Node(text, [], [def_marker], source_xml=ch)
                n.tagged_text = tagged_text
                i += 1
                nodes.append(n)
            else:
                # this is the only node around
                section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # Checked once per XML paragraph, against its last marker's
            # text
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    m_stack = tree_utils.NodeStack()

    def push(node, level):
        """Strip emphasis tags from the label and add the node to the
        stack at `level`"""
        top = m_stack.peek()
        node.label = [part.replace('<E T="03">', '').replace('</E>', '')
                      for part in node.label]
        if len(top) == 0:
            m_stack.push_last((level, node))
        else:
            m_stack.add(level, node)

    if manual_hierarchy:
        if nodes:
            logging.warning('Using manual depth hierarchy.')
            depths = manual_hierarchy
            if len(nodes) == len(depths):
                for node, spec in zip(nodes, depths):
                    if isinstance(spec, (tuple, list)):
                        depth, marker = spec
                        node.marker = marker
                    else:
                        # Plain depth. Anything that is not a number now
                        # fails at `1 + depth` instead of silently reusing
                        # the previous node's depth.
                        depth = spec
                    push(node, 1 + depth)
            else:
                logging.error('Manual hierarchy length does not match node '
                              'list length! ({0} nodes but {1} provided, '
                              '{2})'.format(
                                  len(nodes), len(depths),
                                  [x.label[0] for x in nodes]))
    else:
        # Use constraint programming to figure out possible depth
        # assignments
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
        if depths:
            # Find the assignment which violates the least of our
            # heuristics
            depths = heuristics.prefer_multiple_children(depths, 0.5)
            depths = sorted(depths, key=lambda d: d.weight, reverse=True)
            for node, par in zip(nodes, depths[0]):
                if par.typ != mtypes.stars:
                    push(node, 1 + par.depth)
        elif nodes:
            logging.warning(
                'Could not determine depth when parsing {0}:\n{1}'.format(
                    section_no_without_marker,
                    [node.label[0] for node in nodes]))
            # No usable depths: add every node at one common level
            for node in nodes:
                push(node, 3)

    nodes = []
    section_nums = [int(match.group(1)) for match in
                    re.finditer(r'%s\.(\d+)' % reg_part, section_no)]

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = list(range(first, last + 1))

    plain_sect_texts = [s[0] for s in section_texts]
    tagged_sect_texts = [s[1] for s in section_texts]
    for section_number in section_nums:
        section_number = str(section_number)

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        # The section goes under everything already on the stack; unwind
        # until only the section level is left, then take the section off
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])

    return nodes