def test_get_node_text(self):
    """get_node_text flattens XML to plain text; add_spaces pads emphasis
    boundaries and T="52" emphasis renders as a subscript (``_{...}``)."""
    # assertEquals is a deprecated alias removed in Python 3.12; use
    # assertEqual throughout.
    text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc)
    self.assertEqual('(a)Fruit.Apps, and pins', result)

    text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Fruit. Apps, and pins', result)

    text = '<P>(a) <E T="03">Fruit.</E> Apps, and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Fruit. Apps, and pins', result)

    text = '<P>(a) ABC<E T="52">123</E>= 5</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) ABC_{123} = 5', result)

    text = '<P>(a) <E>Keyterm.</E> ABC<E T="52">123</E>= 5</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Keyterm. ABC_{123} = 5', result)
def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above"""
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    # One bucket of cells per level of the header tree (including the root)
    header = [[] for _ in range(header_root.height())]

    def per_node(node):
        # Record each header cell with its computed spans at its tree level
        header[node.level].append({'text': node.text,
                                   'colspan': node.colspan,
                                   'rowspan': node.rowspan})
    struct.walk(header_root, per_node)
    header = header[1:]     # skip the root

    rows = []
    for row in xml_node.xpath('./ROW'):
        rows.append([tree_utils.get_node_text(td, add_spaces=True).strip()
                     for td in row.xpath('./ENT')])

    table_data = {'header': header, 'rows': rows}
    # An optional TTITLE element becomes the table caption
    caption_nodes = xml_node.xpath('./TTITLE')
    if len(caption_nodes):
        text = tree_utils.get_node_text(caption_nodes[0]).strip()
        table_data["caption"] = text
    return table_data
def table_xml_to_plaintext(xml_node):
    """Markdown representation of a table. Note that this doesn't account for
    all the options needed to display the table properly, but works fine for
    simple tables. This gets included in the reg plain text"""
    def cell_text(cell):
        # Shared cleanup for header and body cells
        return tree_utils.get_node_text(cell, add_spaces=True).strip()

    header = [cell_text(hd)
              for hd in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    divider = ['---'] * len(header)
    rows = [[cell_text(td) for td in tr.xpath('./ENT')]
            for tr in xml_node.xpath('./ROW')]

    lines = ['|' + '|'.join(cells) + '|'
             for cells in [header, divider] + rows]
    return '\n'.join(lines)
def parse_amdpar(par, initial_context): """ Parse the <AMDPAR> tags into a list of paragraphs that have changed. """ # Replace and "and"s in titles; they will throw off and_token_resolution for e in filter(lambda e: e.text, par.xpath('./E')): e.text = e.text.replace(' and ', ' ') text = get_node_text(par, add_spaces=True) tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)] tokenized = compress_context_in_tokenlists(tokenized) tokenized = resolve_confused_context(tokenized, initial_context) tokenized = paragraph_in_context_moved(tokenized, initial_context) tokenized = remove_false_deletes(tokenized, text) tokenized = multiple_moves(tokenized) tokenized = switch_passive(tokenized) tokenized = and_token_resolution(tokenized) tokenized, subpart = deal_with_subpart_adds(tokenized) tokenized = context_to_paragraph(tokenized) tokenized = move_then_modify(tokenized) if not subpart: tokenized = separate_tokenlist(tokenized) initial_context = switch_context(tokenized, initial_context) tokenized, final_context = compress_context(tokenized, initial_context) amends = make_amendments(tokenized, subpart) return amends, final_context
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars.
    Generator: yields Node objects (plus INLINE_STARS placeholders when a
    chunk ends in '* * *')."""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    # -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    # Node for this paragraph (text up to the first collapsed marker)
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP, tagged_text=text_with_tags)
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    # Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            # A bare '1' marker is actually the emphasized form
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    # Siblings up to (but excluding) the next title belong to this header
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # No marker found: treat this as a continuation of the
            # previous paragraph rather than a new node
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if previous.tagged_text:
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            nodes.extend(nodes_from_interp_p(xml_node))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    add_nodes_to_stack(nodes, inner_stack)
def build_header(xml_nodes):
    """Builds a TableHeaderNode tree, with an empty root. Each node in the
    tree includes its colspan/rowspan"""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # Root
    for xml_node in xml_nodes:
        # The H attribute gives the header's nesting level
        level = int(xml_node.attrib['H'])
        text = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(level, TableHeaderNode(text, level))
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    max_height = root.height()

    def set_rowspan(n):
        # Leaf cells stretch down to the deepest header row
        n.rowspan = max_height - n.height() - n.level + 1
    struct.walk(root, set_rowspan)

    def set_colspan(n):
        # A header spans as many columns as it has leaf descendants
        n.colspan = n.width()
    struct.walk(root, set_colspan)

    return root
def derive_nodes(self, xml, processor=None):
    """Wrap this element's children in github-style code fencing and emit
    a single markerless node containing the fenced text."""
    lines = ['```' + self.fence_type(xml)]
    lines.extend(tree_utils.get_node_text(child).strip() for child in xml)
    lines.append('```')
    return [Node('\n'.join(lines), label=[mtypes.MARKERLESS])]
def make_authority_instructions(auth_xml, cfr_part):
    """Creates an `EREGS_INSTRUCTIONS` element specific to the authority
    information"""
    instructions = etree.Element('EREGS_INSTRUCTIONS')
    authority = etree.SubElement(instructions, 'AUTHORITY', label=cfr_part)
    paragraphs = [get_node_text(p, add_spaces=True)
                  for p in auth_xml.xpath('./P')]
    authority.text = '\n'.join(paragraphs)
    return instructions
def process(self, appendix, part):
    """Walk an appendix's XML children, dispatching on tag to build up the
    node stack; returns the assembled appendix tree root (if any)."""
    self.m_stack = tree_utils.NodeStack()
    self.part = part
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # A subheader is an HD without a paragraph marker (or with a '.'
        # in it), or a P/FP that parses as a title/label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1])) or
                (tag in ('P', 'FP') and
                 title_label_pair(text, self.appendix_letter, self.part)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED') or
                child.tag == 'RESERVED'):
            self.end_group()
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.end_group()
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text,
                tree_utils.get_node_text_tags_preserved(child))
        elif child.tag == 'SEQUENCE':
            # end_group() resets depth; preserve it across the call so the
            # sequence nests at the current level
            old_depth = self.depth
            self.end_group()
            self.depth = old_depth
            self.process_sequence(child)
        elif child.tag in ('P', 'FP'):
            text = self.insert_dashes(child, text)
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))

    self.end_group()
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        # Root of the assembled tree; implicitly returns None when empty
        return self.m_stack.m_stack[0][0][1]
def process_appendix(m_stack, current_section, child):
    """Legacy appendix parser: HD children become appendix sections,
    P children become (possibly marker-split) paragraphs on the stack."""
    # NOTE(review): html_parser appears unused in this function — confirm
    html_parser = HTMLParser.HTMLParser()
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)
            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)
            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            # Combine text with the tails of inline children for marker
            # detection
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)
            node_text = tree_utils.get_node_text(ch)
            if len(markers_list) > 0:
                if len(markers_list) > 1:
                    actual_markers = ['(%s)' % m for m in markers_list]
                    node_text = tree_utils.split_text(
                        node_text, actual_markers)
                else:
                    node_text = [node_text]
                for m, node_text in zip(markers_list, node_text):
                    n = Node(
                        node_text, label=[str(m)], node_type=Node.APPENDIX)
                    last = m_stack.peek()
                    node_level = determine_level(m, last[0][0])
                    if m == 'i':
                        # This is bit of a hack, since we can't easily
                        # distinguish between the Roman numeral (i) and the
                        # letter (i) to determine the level. We look ahead
                        # to help. This is not a complete solution and we
                        # should circle back at some point.
                        next_text = ' '.join(
                            [ch.getnext().text] +
                            [c.tail for c in ch.getnext() if c.tail])
                        next_markers = tree_utils.get_paragraph_markers(
                            next_text)
                        if next_markers[0] == 'ii':
                            node_level = 5
                    tree_utils.add_to_stack(m_stack, node_level, n)
            else:
                # Markerless paragraph: append to the previous node's text
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    for header in appendix_headers(appendix):
        header_text = tree_utils.get_node_text(header)
        if self.appendix_letter:
            # More than one header found: warn, keep the most recent
            logger.warning("Found two appendix headers: %s and %s",
                           self.appendix_letter, header_text)
        parsed = grammar.headers.parseString(header_text)
        self.appendix_letter = parsed.appendix
    return self.appendix_letter
def derive_nodes(self, xml, processor=None):
    """Fence the contents of a CODE element (github style) and emit one
    markerless node. Empty child texts are skipped."""
    body = [tree_utils.get_node_text(child).strip() for child in xml]
    lines = ['```' + xml.get('LANGUAGE', 'code')]
    lines.extend(line for line in body if line)
    lines.append('```')
    return [Node('\n'.join(lines), label=[mtypes.MARKERLESS])]
def process(self, appendix, part):
    """Walk an appendix's XML children, dispatching on tag, and return the
    assembled tree root (if any) with temporary p_level markers removed."""
    self.m_stack = tree_utils.NodeStack()
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # HD without a marker (or with '.' in it), or a P/FP that parses
        # as a title/label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1])) or
                (tag in ('P', 'FP') and
                 title_label_pair(text, self.appendix_letter)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED') or
                child.tag == 'RESERVED'):
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            # Look ahead to the next marked paragraph so this text can be
            # split on internal markers
            if child.getnext() is None:
                next_text = ''
            else:
                next_text = self.find_next_text_with_marker(
                    child.getnext()) or ''
            texts = self.split_paragraph_text(text, next_text)
            for text, next_text in zip(texts, texts[1:]):
                self.paragraph_with_marker(text, next_text)
        elif child.tag in ('P', 'FP'):
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))

    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        root = self.m_stack.m_stack[0][0][1]

        def per_node(n):
            # p_level was only needed while building; strip it from the
            # final tree
            if hasattr(n, 'p_level'):
                del n.p_level
        walk(root, per_node)
        return root
def derive_nodes(self, xml, processor):
    """Finds and deletes the category header before recursing. Adds this
    header as a title."""
    working = deepcopy(xml)     # modify a copy, never the caller's tree
    header = working.xpath('./HD')[0]
    working.remove(header)
    title = tree_utils.get_node_text(header)
    node = Node(title=title, label=[self.marker(title)])
    return [processor.process(working, node)]
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    header_children = (c for c in appendix.getchildren()
                       if is_appendix_header(c))
    for header in header_children:
        header_text = tree_utils.get_node_text(header)
        if self.appendix_letter:
            # Multiple headers: warn and keep the most recent parse
            logging.warning("Found two appendix headers: %s and %s",
                            self.appendix_letter, header_text)
        self.appendix_letter = headers.parseString(header_text).appendix
    return self.appendix_letter
def process_sequence(self, root):
    """Treat every child of a SEQUENCE element as a marked paragraph,
    then close the group one level deeper than the current depth."""
    for child in root.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        text = self.insert_dashes(child, text)
        self.paragraph_with_marker(
            text, tree_utils.get_node_text_tags_preserved(child))
    # end_group() mutates depth; bump it for the group close and restore
    old_depth = self.depth
    self.depth += 1
    self.end_group()
    self.depth = old_depth
def fence(self, xml_node, fence_type):
    """Use github-like fencing to indicate this is a note or code"""
    self.paragraph_counter += 1
    body = [tree_utils.get_node_text(child).strip() for child in xml_node]
    content = '\n'.join(['```' + fence_type] + body + ['```'])
    label = 'p' + str(self.paragraph_counter)
    self.nodes.append(Node(content, node_type=Node.APPENDIX,
                           label=[label], source_xml=xml_node))
def fence(self, xml_node, fence_type):
    """Wrap the node's children in github-style fencing (note or code)."""
    self.paragraph_counter += 1
    pieces = ['```' + fence_type]
    pieces += [tree_utils.get_node_text(ch).strip() for ch in xml_node]
    pieces += ['```']
    node = Node('\n'.join(pieces), node_type=Node.APPENDIX,
                label=['p' + str(self.paragraph_counter)],
                source_xml=xml_node)
    self.nodes.append(node)
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    for child in (c for c in appendix.getchildren()
                  if is_appendix_header(c)):
        text = tree_utils.get_node_text(child)
        if self.appendix_letter:
            # Seen a header already — warn, then keep the newer one
            logging.warning("Found two appendix headers: %s and %s",
                            self.appendix_letter, text)
        self.appendix_letter = headers.parseString(text).appendix
    return self.appendix_letter
def test_appendix_headers():
    """appendix_headers picks out the HED header, RESERVED element and
    WHED child — skipping EAR, plain P, non-HED HD and GPH content."""
    with XMLBuilder('APPENDIX') as ctx:
        ctx.EAR('1')
        ctx.HD('2', SOURCE='HED')
        ctx.P('3')
        ctx.HD('4', SOURCE='HD1')
        ctx.GPH('5')
        ctx.RESERVED('6')
        with ctx.WHED():
            ctx.E('7')
    headers = [get_node_text(h)
               for h in appendices.appendix_headers(ctx.xml)]
    assert headers == ['2', '6', '7']
def process(self, appendix, part):
    """Walk an appendix's XML children, dispatching on tag to build up the
    node stack; returns the assembled appendix tree root (if any)."""
    self.m_stack = tree_utils.NodeStack()
    self.part = part
    self.paragraph_counter = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # HD without a marker (or with '.' in it), or a P/FP that parses
        # as a title/label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1])) or
                (tag in ('P', 'FP') and
                 title_label_pair(text, self.appendix_letter, self.part)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED') or
                child.tag == 'RESERVED'):
            self.end_group()
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.end_group()
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text,
                tree_utils.get_node_text_tags_preserved(child))
        elif child.tag in ('P', 'FP'):
            text = self.insert_dashes(child, text)
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))

    self.end_group()
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        # Root of the assembled tree; implicitly returns None when empty
        return self.m_stack.m_stack[0][0][1]
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and
    notice parsing. root is the root interpretation node (e.g. a Node with
    label '1005-Interp'). xml_nodes contains all XML nodes which will be
    relevant to the interpretations"""
    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            # Insert placeholder nodes for any skipped label levels
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)
            process_inner_children(inner_stack, ch, parent=node)
            while inner_stack.size() > 1:
                inner_stack.unwind()
            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        # Strip emphasis markup left in labels by marker detection
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and
    notice parsing. root is the root interpretation node (e.g. a Node with
    label '1005-Interp'). xml_nodes contains all XML nodes which will be
    relevant to the interpretations"""
    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            # Insert placeholder nodes for any skipped label levels
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)
            process_inner_children(inner_stack, ch, parent=node)
            while inner_stack.size() > 1:
                inner_stack.unwind()
            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        # Strip emphasis markup left in labels by marker detection
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
def add_ref_attributes(self, xml):
    """Modify each footnote reference so that it has an attribute
    containing its footnote content"""
    for ref in xml.xpath(self.XPATH_IS_REF):
        sus = ref.xpath(self.XPATH_FIND_NOTE_TPL.format(ref.text))
        # Only attach content when the note is near enough to plausibly
        # belong to this reference
        if sus and self.is_reasonably_close(ref, sus[0]):
            # copy as we need to modify
            note = deepcopy(sus[0].getparent())
            # Modify note to remove the reference text; it's superfluous
            for su in note.xpath('./SU'):
                replace_xml_node_with_text(su, su.tail or '')
            ref.attrib['footnote'] = get_node_text(note).strip()
def find_next_text_with_marker(self, node):
    """Scan xml nodes and their neighbors looking for text that begins
    with a marker. When found, return it"""
    # Iterative walk over following siblings (equivalent to the recursive
    # formulation)
    while node is not None:
        if node.tag == 'HD':            # Next section; give up
            return None
        if node.tag in ('P', 'FP'):     # Potential text
            text = tree_utils.get_node_text(node)
            if initial_marker(text):
                return text
        node = node.getnext()
    return None     # ran off the end of the siblings
def add_ref_attributes(self, xml):
    """Modify each footnote reference so that it has an attribute
    containing its footnote content"""
    for ref in xml.xpath(self.XPATH_IS_REF):
        sus = ref.xpath(self.XPATH_FIND_NOTE_TPL.format(ref.text))
        # Only attach content when the note is near enough to plausibly
        # belong to this reference
        if sus and self.is_reasonably_close(ref, sus[0]):
            # copy as we need to modify
            note = deepcopy(sus[0].getparent())
            # Modify note to remove the reference text; it's superfluous
            for su in note.xpath('./SU'):
                replace_xml_node_with_text(su, su.tail or '')
            ref.attrib['footnote'] = get_node_text(note).strip()
def note(self, xml_node):
    """Use github-like fencing to indicate this is a note"""
    self.paragraph_counter += 1
    body = [tree_utils.get_node_text(child).strip()
            for child in xml_node]
    content = '\n'.join(['```note'] + body + ['```'])
    node = Node(content, node_type=Node.APPENDIX,
                label=['p' + str(self.paragraph_counter)],
                source_xml=xml_node)
    self._indent_if_needed()
    self.m_stack.add(self.depth, node)
def derive_nodes(self, xml, processor=None):
    """Split this paragraph into one Node per detected marker.

    Both the plain and tag-preserved renderings are split on the same
    markers so each node carries matching ``text`` and ``tagged_text``."""
    nodes = []
    text = tree_utils.get_node_text(xml).strip()
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = self.paragraph_markers(text)
    with_parens = ['({})'.format(m) for m in markers_list]
    triplets = zip(markers_list,
                   tree_utils.split_text(text, with_parens),
                   tree_utils.split_text(tagged_text, with_parens))
    for m, text, tagged_text in triplets:
        node = Node(text=text.strip(), label=[m], source_xml=xml)
        # six.text_type rather than the Python-2-only `unicode` builtin
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)
    return nodes
def get_markers_and_text(node, markers_list):
    """Pair each marker with the (plain-text, tagged-text) chunk that
    follows it.

    Returns an empty sequence when markers_list is empty — the original
    raised UnboundLocalError in that case because node_text_list was
    never assigned."""
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)
    node_text_list = []     # stays empty when there are no markers
    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # Plain text has no emphasis tags around markers
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags,
                                             actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        node_text_list = [(node_text, text_with_tags)]
    return zip(markers_list, node_text_list)
def get_markers_and_text(node, markers_list):
    """Pair each marker with the (plain-text, tagged-text) chunk that
    follows it; a leading chunk with no marker gets MARKERLESS.

    May mutate markers_list (inserting MARKERLESS at the front)."""
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)
    actual_markers = ['(%s)' % m for m in markers_list]
    # Plain text has no emphasis tags around markers
    plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                     for m in actual_markers]
    node_texts = tree_utils.split_text(node_text, plain_markers)
    tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
    # Materialize: under Python 3 zip() is lazy and has no len()
    node_text_list = list(zip(node_texts, tagged_texts))

    if len(node_text_list) > len(markers_list):     # diff can only be 1
        markers_list.insert(0, mtypes.MARKERLESS)
    return zip(markers_list, node_text_list)
def fetch_dates(xml):
    """Pull out any dates (and their types) from the XML. Not all notices
    have all types of dates, some notices have multiple dates of the same
    type."""
    paragraphs = xml.xpath('//EFFDATE/P') or xml.xpath('//DATES/P')
    dates = {}
    for paragraph in paragraphs:
        for sentence in get_node_text(paragraph).split('.'):
            parsed = parse_date_sentence(sentence.replace('\n', ' '))
            if parsed:
                date_type, date = parsed
                dates.setdefault(date_type, []).append(date)
    if dates:
        return dates
def get_markers_and_text(node, markers_list):
    """Pair each marker with the (plain-text, tagged-text) chunk that
    follows it.

    Returns an empty sequence when markers_list is empty — the original
    raised UnboundLocalError in that case because node_text_list was
    never assigned."""
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)
    node_text_list = []     # stays empty when there are no markers
    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # Plain text has no emphasis tags around markers
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags,
                                             actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        node_text_list = [(node_text, text_with_tags)]
    return zip(markers_list, node_text_list)
def derive_nodes(self, xml, processor=None):
    """Build one node from this paragraph, deriving its label marker from
    a leading "(x)" or "x." prefix when one is present."""
    text = tree_utils.get_node_text(xml).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    node = Node(text=text, source_xml=xml)
    node.tagged_text = six.text_type(tagged)
    if text[:1] == '(':
        regex = self._PAREN_REGEX
    else:
        regex = self._PERIOD_REGEX
    match = regex.match(text)
    node.label = [match.group('marker')] if match else [mtypes.MARKERLESS]
    return [node]
def derive_nodes(self, xml, processor=None):
    """Split this paragraph into one node per detected marker, keeping
    plain and tagged text in sync."""
    plain = tree_utils.get_node_text(xml).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers = self.paragraph_markers(plain)
    delimiters = ['({})'.format(m) for m in markers]
    plain_parts = tree_utils.split_text(plain, delimiters)
    tagged_parts = tree_utils.split_text(tagged, delimiters)

    nodes = []
    for marker, part, tagged_part in zip(markers, plain_parts,
                                         tagged_parts):
        node = Node(text=part.strip(), label=[marker], source_xml=xml)
        node.tagged_text = six.text_type(tagged_part.strip())
        nodes.append(node)
    return nodes
def derive_nodes(self, xml, processor=None):
    """Build one node from this paragraph, deriving its label marker from
    a leading "(x)" or "x." prefix when one is present."""
    text = tree_utils.get_node_text(xml).strip()
    node = Node(text=text, source_xml=xml)
    # six.text_type rather than the Python-2-only `unicode` builtin
    node.tagged_text = six.text_type(
        tree_utils.get_node_text_tags_preserved(xml).strip())
    regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
    match = regex.match(text)
    if match:
        node.label = [match.group('marker')]
    else:
        node.label = [mtypes.MARKERLESS]
    return [node]
def build_section(reg_part, section_xml):
    """Build a section node (with marker-paragraph children) from a
    SECTION XML element; returns None when the SECTNO doesn't match
    reg_part."""
    p_level = 1
    m_stack = NodeStack()
    section_texts = []
    for ch in section_xml.getchildren():
        if ch.tag == 'P':
            # Combine text with the tails of inline children for marker
            # detection
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)
            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) > 1:
                actual_markers = ['(%s)' % m for m in markers_list]
                node_text = tree_utils.split_text(node_text,
                                                  actual_markers)
            elif markers_list:
                node_text = [node_text]
            else:   # Does not contain paragraph markers
                section_texts.append(node_text)

            # zip is empty when markers_list is empty, so the markerless
            # case above falls through harmlessly
            for m, node_text in zip(markers_list, node_text):
                n = Node(node_text, [], [str(m)])

                new_p_level = determine_level(m, p_level)
                last = m_stack.peek()
                if len(last) == 0:
                    m_stack.push_last((new_p_level, n))
                else:
                    tree_utils.add_to_stack(m_stack, new_p_level, n)
                p_level = new_p_level

    section_title = section_xml.xpath('SECTNO')[0].text
    subject_text = section_xml.xpath('SUBJECT')[0].text
    if subject_text:
        section_title += " " + subject_text

    section_number_match = re.search(r'%s\.(\d+)' % reg_part,
                                     section_title)
    # Sometimes not reg text sections get mixed in
    if section_number_match:
        section_number = section_number_match.group(1)
        section_text = ' '.join([section_xml.text] + section_texts)
        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            tree_utils.unwind_stack(m_stack)

        return m_stack.pop()[0][1]
def process_supplement(part, m_stack, child):
    """ Parse the Supplement sections and paragraphs.

    HD children become interpretation headers; P children become
    interpretation paragraphs. Other tags are skipped — the original
    raised NameError (unbound ``n``/``node_level``) when the first child
    was neither HD nor P. """
    for ch in child.getchildren():
        n = None
        if ch.tag.upper() == 'HD':
            label_text = text_to_label(ch.text, part)
            n = Node(node_type=Node.INTERP, label=label_text,
                     title=ch.text)
            node_level = 1
        elif ch.tag.upper() == 'P':
            # Combine text with the tails of inline children for marker
            # detection
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            marker = get_interpretation_markers(text)
            node_text = tree_utils.get_node_text(ch)

            n = Node(node_text, label=[marker], node_type=Node.INTERP)
            node_level = interpretation_level(marker)
        if n is not None:
            tree_utils.add_to_stack(m_stack, node_level, n)
def split_by_markers(xml):
    """Given an xml node, pull out triplets of (marker, plain-text
    following, text-with-tags following) for each subparagraph found"""
    plain_text = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(tagged_text, next_marker(xml))

    # Markers appear without emphasis in the plain rendering
    plain_delims = ['({})'.format(mtypes.deemphasize(m))
                    for m in markers_list]
    tagged_delims = ['({})'.format(m) for m in markers_list]
    node_texts = tree_utils.split_text(plain_text, plain_delims)
    tagged_texts = tree_utils.split_text(tagged_text, tagged_delims)

    # An extra leading chunk means text preceded the first marker
    if len(node_texts) > len(markers_list):
        markers_list.insert(0, mtypes.MARKERLESS)
    return list(zip(markers_list, node_texts, tagged_texts))
def next_marker(xml_node, remaining_markers):
    """Try to determine the marker following the current xml_node.
    Remaining markers is a list of other marks *within* the xml_node. May
    return None"""
    # More markers in this xml node
    if remaining_markers:
        return remaining_markers[0][0]

    # Otherwise inspect the following sibling, skipping stars/page breaks
    sib = xml_node.getnext()
    while sib is not None and sib.tag in ('STARS', 'PRTPAGE'):
        sib = sib.getnext()
    if sib is None:
        return None
    sibling_markers = get_markers(tree_utils.get_node_text(sib))
    if sibling_markers:
        return sibling_markers[0]
    return None
def parse_intro(notice_xml, doc_id):
    """The introduction to the preamble includes some key paragraphs which
    we bundle together in an "intro" node. Returns None when no intro
    sections are found."""
    root = Node(node_type='preamble_intro', label=[doc_id, 'intro'],
                title='Preamble introduction')
    # The FR sections that make up a notice's introduction
    parent_tags = ('AGY', 'ACT', 'SUM', 'DATES', 'ADD', 'FURINF')
    xpath = '|'.join('.//' + parent_tag for parent_tag in parent_tags)
    for xml in notice_xml.xpath(xpath):
        title = xml.xpath('./HD')[0].text.strip()
        paras = [get_node_text(p) for p in xml.xpath("./P")]
        # Labels are positional: parent is p<N> for the Nth intro section
        parent_label = [doc_id, 'intro',
                        'p{}'.format(len(root.children) + 1)]
        children = []
        for i, para in enumerate(paras, start=1):
            label = [doc_id, 'intro',
                     'p{}'.format(len(root.children) + 1),
                     'p{}'.format(i)]
            children.append(Node(text=para, node_type='preamble',
                                 label=label))
        root.children.append(Node(node_type='preamble',
                                  label=parent_label, title=title,
                                  children=children))
    if root.children:
        return root
def remove_toc(appendix, letter):
    """The TOC at the top of certain appendices gives us trouble since it
    looks a *lot* like a sequence of headers. Remove it if present"""
    fingerprints = set()
    potential_toc = set()
    for node in appendix.xpath("./HD[@SOURCE='HED']/following-sibling::*"):
        parsed = parsed_title(tree_utils.get_node_text(node), letter)
        if parsed:
            # The headers may not match character-per-character. Only
            # compare the parsed results.
            fingerprint = tuple(parsed)
            # Hit the real content: a repeated title as an HD means the
            # earlier occurrences were the TOC — delete them
            if fingerprint in fingerprints and node.tag == 'HD':
                for el in potential_toc:
                    el.getparent().remove(el)
                return
            else:
                fingerprints.add(fingerprint)
                potential_toc.add(node)
        elif node.tag != 'GPH':     # Not a title and not a img => no TOC
            return
def parse_amdpar(par, initial_context): """ Parse the <AMDPAR> tags into a list of paragraphs that have changed. """ # Replace and "and"s in titles; they will throw off and_token_resolution for e in filter(lambda e: e.text, par.xpath('./E')): e.text = e.text.replace(' and ', ' ') text = get_node_text(par, add_spaces=True) auth = par.getnext() # potential authority info if auth is not None and auth.tag != 'AUTH': auth = None tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)] tokenized = compress_context_in_tokenlists(tokenized) tokenized = resolve_confused_context(tokenized, initial_context) tokenized = paragraph_in_context_moved(tokenized, initial_context) tokenized = remove_false_deletes(tokenized, text) tokenized = multiple_moves(tokenized) tokenized = switch_passive(tokenized) tokenized = and_token_resolution(tokenized) tokenized, designated_subpart = subpart_designation(tokenized) tokenized = context_to_paragraph(tokenized) tokenized = move_then_modify(tokenized) if not designated_subpart: tokenized = separate_tokenlist(tokenized) initial_context = switch_part_context(tokenized, initial_context) initial_context = switch_level2_context(tokenized, initial_context) tokenized, final_context = compress_context(tokenized, initial_context) if designated_subpart: return make_subpart_designation_instructions(tokenized), final_context elif auth is not None: cfr_part = final_context[0] return make_authority_instructions(auth, cfr_part), final_context else: return make_instructions(tokenized), final_context
def get_appendix_title(node):
    """ Retrieve the first Appendix/Supplement title from its headers. """
    first_header = appendix_headers(node)[0]
    return tree_utils.get_node_text(first_header)
def get_subpart_group_title(subpart_xml):
    """Derive the title of a subpart or subject group; None when the
    element carries no RESERVED or HD header."""
    headers = subpart_xml.xpath('./RESERVED|./HD')
    if not headers:
        return None
    return tree_utils.get_node_text(headers[0])
def derive_nodes(self, xml, processor=None):
    """Build a single markerless Node carrying both the plain and the
    tag-preserving text of this XML element."""
    plain = tree_utils.get_node_text(xml).strip()
    with_tags = tree_utils.get_node_text_tags_preserved(xml).strip()
    node = Node(text=plain, tagged_text=with_tags,
                label=[mtypes.MARKERLESS])
    return [node]
def build_from_section(reg_part, section_xml):
    """Build the Node tree(s) for a SECTION XML element.

    Collects paragraph/STARS children, derives their depth hierarchy
    (either via constraint programming or via the manually curated
    PARAGRAPH_HIERARCHY table), and assembles one section Node per
    section number mentioned in SECTNO.

    :param reg_part: CFR part number as a string, e.g. "1026"
    :param section_xml: the <SECTION> element
    :return: list of section Nodes (one per section number in SECTNO)
    """
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    # Some part/section combos have hand-maintained paragraph depths
    manual_hierarchy_flag = False
    if reg_part in PARAGRAPH_HIERARCHY and \
            section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]:
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # Is this a bunch of definitions that don't have numbers
            # next to them?
            if len(nodes) > 0:
                # Bug fix: str.find returns -1 (truthy) when the phrase
                # is absent and 0 (falsy) when it starts the string, so
                # the bare .find(...) check was broken; compare > -1
                # like the sibling checks do.
                if (subject_text.find('Definitions.') > -1 or
                        nodes[-1].text.find(
                            'For the purposes of this section') > -1):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()
    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(
                    len(nodes), len(depths)))
    elif nodes and not manual_hierarchy_flag:
        # derive_depths failed: fall back to flat depth 3 so we at least
        # keep the content
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers, e.g. "§§ 1026.1-3" covers 1 through 3
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        # Unwind the stack so every paragraph hangs off this section node
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])
    return nodes
def build_from_section(reg_part, section_xml):
    """Build the Node tree(s) for a SECTION XML element.

    Collects P/STARS children, derives their paragraph depth hierarchy
    via constraint programming, and returns one section Node per section
    number found in SECTNO.

    :param reg_part: CFR part number as a string
    :param section_xml: the <SECTION> element
    :return: list of section Nodes
    """
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # No paragraph marker: treat as part of the section intro
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # Ellipsis at the end of a paragraph implies elided content
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                # Strip emphasis tags leaking into marker labels
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers, e.g. "§§ 100.1-3" covers 1 through 3
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        # Unwind the paragraph stack beneath this section node
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])
    return nodes
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack onto which interpretation paragraphs
        are pushed (mutated in place; nothing is returned)
    :param xml_node: the title element whose following siblings are the
        interpretation paragraphs
    :param parent: node to append orphan text to when no paragraphs have
        been collected yet
    """
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search('[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        # Best-effort: any failure (no match, no text, etc.) just means
        # no configured hierarchy for this section
        pass

    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            # Marker-less paragraph but depths are known: label it by
            # position instead
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            if nodes:
                previous = nodes[-1]
            else:
                # NOTE(review): if parent is None here (the default),
                # this raises AttributeError -- callers presumably pass
                # a parent when orphan text is possible; confirm
                previous = parent
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                     (mtypes.lower, mtypes.roman,
                                      mtypes.upper),
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])
    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip emphasis tags leaking into marker labels
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')
    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
def derive_nodes(self, xml, processor=None):
    """Turn a lower-level HD element into a title-only markerless Node."""
    # This should match HD elements only at lower levels, and for now
    # we'll just stash their content in the title (body text stays empty)
    heading = tree_utils.get_node_text(xml).strip()
    return [Node(text='', title=heading, label=[mtypes.MARKERLESS])]
def derive_nodes(self, xml, processor=None):
    """Wrap this element's stripped text in a single markerless Node."""
    stripped = tree_utils.get_node_text(xml).strip()
    return [Node(text=stripped, label=[mtypes.MARKERLESS])]
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation.
    This is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack onto which interpretation paragraphs
        are pushed (mutated in place; nothing is returned)
    :param xml_node: the title element whose following siblings are the
        interpretation paragraphs
    """
    # Everything up to the next title belongs to this interpretation
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # Marker-less paragraph: treat it as a continuation of the
            # previous paragraph rather than a node of its own
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end],
                         label=[marker], node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip emphasis tags leaking into marker labels
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def test_get_node_text_no_tail(self):
    """Text trailing an element's closing tag (its "tail") must not
    leak into get_node_text's output when processing part of a larger
    XML doc"""
    root = etree.fromstring("<root>Some <p>paragraph</p> w/ tail</root>")
    paragraph = root.xpath("./p")[0]
    result = tree_utils.get_node_text(paragraph)
    self.assertEqual(result, 'paragraph')
def add_element(stack, xml_node, level=None):
    """Push a TableHeaderNode built from this XML node onto the stack
    at the given level."""
    header_text = tree_utils.get_node_text(
        xml_node, add_spaces=True).strip()
    header = TableHeaderNode(header_text, level)
    stack.add(level, header)