def build_section_by_section(sxs, fr_start_page, previous_label):
    """Turn section-by-section analysis XML nodes into a list of nested dicts.

    On every pass ``split_into_ttsr`` hands back the next title, the text
    elements under it, its sub-sections and the nodes that still have to be
    processed.  One dict is produced per title with the keys ``page``,
    ``title``, ``paragraphs``, ``children`` and ``footnote_refs``.  A
    ``labels`` key is added (and ``previous_label`` advanced) only when the
    title yields labels that are not all a repeat of, or a backtrack from,
    ``previous_label``.  The previous label is carried along so that
    analyses of the same section can be merged.

    :param sxs: xml nodes of the section-by-section analysis
    :param fr_start_page: page passed to ``find_page`` for the titles at
        this level; children receive their parent's page instead
    :param previous_label: most recently seen label; the piece before its
        first ``-`` is used as the CFR part
    :returns: list of dicts, one per title found in ``sxs``
    """
    results = []
    # Test the length explicitly: "while sxs:" is deprecated
    while len(sxs):
        cfr_part = previous_label.split('-')[0]
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)
        page = find_page(title, title.sourceline, fr_start_page)

        # Copies are edited below, so the nodes in text_els stay untouched
        paragraph_xmls = [deepcopy(node) for node in text_els
                          if node.tag in ('P', 'FP')]
        footnote_refs = []
        for p_idx, paragraph_xml in enumerate(paragraph_xmls):
            for unwanted_tag in ('PRTPAGE', 'FTREF'):
                spaces_then_remove(paragraph_xml, unwanted_tag)
            swap_emphasis_tags(paragraph_xml)

            # Anything inside a SU can also be ignored
            for su in paragraph_xml.xpath('./SU'):
                serialized = etree.tounicode(su)
                # Offset is measured before this SU is taken out
                offset = body_to_string(paragraph_xml).find(serialized)
                footnote_refs.append({
                    'paragraph': p_idx,
                    'reference': su.text,
                    'offset': offset,
                })

                parent = su.getparent()
                trailing = su.tail
                if trailing:
                    # Hand the text that followed the SU to whatever
                    # precedes it: the previous sibling's tail, or the
                    # parent's text when the SU is the first child
                    before = su.getprevious()
                    if before is not None:
                        before.tail = (before.tail or '') + trailing
                    else:
                        parent.text = (parent.text or '') + trailing
                parent.remove(su)

        paragraphs = [body_to_string(node) for node in paragraph_xmls]
        labels = parse_into_labels(title.text, cfr_part)
        label_for_children = labels[-1] if labels else previous_label

        # Recursively build children. Be sure to give them the proper label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        next_structure = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnote_refs,
        }

        # No label => subheader. A repeated label or a backtrack is left
        # unlabelled so it is concatenated; anything else starts a new
        # labelled entry.
        if labels and any(label != previous_label
                          and not is_backtrack(previous_label, label)
                          for label in labels):
            previous_label = labels[-1]
            next_structure['labels'] = labels

        results.append(next_structure)

    return results
def build_section_by_section(sxs, fr_start_page, previous_label):
    """Convert section-by-section analysis XML nodes into nested dicts.

    The node list is consumed one title at a time through
    ``split_into_ttsr``.  Every title becomes a dict holding ``page``,
    ``title``, ``paragraphs``, ``children`` and ``footnote_refs``; a
    ``labels`` entry is attached only when the title's labels are not all a
    repeat of, or a backtrack from, ``previous_label`` (which is then
    advanced to the last of them).  The previous label is carried along to
    merge analyses of the same section.

    :param sxs: xml nodes of the section-by-section analysis
    :param fr_start_page: page passed to ``find_page`` for the titles at
        this level; children receive their parent's page instead
    :param previous_label: most recently seen label; the piece before its
        first ``-`` is used as the CFR part
    :returns: list of dicts, one per title found in ``sxs``
    """
    collected = []
    # Explicit length check: "while sxs:" is deprecated
    while len(sxs):
        cfr_part = previous_label.split('-')[0]
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)
        page = find_page(title, title.sourceline, fr_start_page)

        # Only P / FP elements are kept, and only copies of them are edited
        paragraph_xmls = [deepcopy(elem) for elem in text_els
                          if elem.tag in ('P', 'FP')]
        footnote_refs = []
        for p_idx, paragraph_xml in enumerate(paragraph_xmls):
            for unwanted_tag in ('PRTPAGE', 'FTREF'):
                spaces_then_remove(paragraph_xml, unwanted_tag)
            swap_emphasis_tags(paragraph_xml)

            # Anything inside a SU can also be ignored
            for su in paragraph_xml.xpath('./SU'):
                # NOTE(review): etree.tostring returns bytes on Python 3
                # unless encoding='unicode' is passed -- confirm this
                # matches the type body_to_string produces
                su_markup = etree.tostring(su)
                # Position is taken while the SU is still in the paragraph
                position = body_to_string(paragraph_xml).find(su_markup)
                footnote_refs.append({
                    'paragraph': p_idx,
                    'reference': su.text,
                    'offset': position,
                })

                owner = su.getparent()
                after_su = su.tail
                if after_su:
                    # Re-attach the text following the SU to the previous
                    # sibling's tail, or to the parent's text when the SU
                    # has no previous sibling
                    left = su.getprevious()
                    if left is not None:
                        left.tail = (left.tail or '') + after_su
                    else:
                        owner.text = (owner.text or '') + after_su
                owner.remove(su)

        paragraphs = [body_to_string(elem) for elem in paragraph_xmls]
        labels = parse_into_labels(title.text, cfr_part)
        label_for_children = labels[-1] if labels else previous_label

        # Recursively build children. Be sure to give them the proper label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        next_structure = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnote_refs,
        }

        # No label => subheader. Repeats and backtracks stay unlabelled so
        # they get concatenated; any other label opens a new entry.
        if labels and any(label != previous_label
                          and not is_backtrack(previous_label, label)
                          for label in labels):
            previous_label = labels[-1]
            next_structure['labels'] = labels

        collected.append(next_structure)

    return collected
def build_section_by_section(sxs, part, fr_start_page):
    """Pull hierarchical data out of section-by-section analysis XML nodes.

    ``split_into_ttsr`` is called repeatedly to peel off the next title,
    its text elements, its sub-sections and the remaining nodes.  Each
    title produces a dict with the keys ``page``, ``title``,
    ``paragraphs``, ``children`` and ``footnote_refs``.  When the title
    parses into labels, one shallow copy of that dict is emitted per label
    with a ``label`` key added (the copies share the same ``paragraphs``,
    ``children`` and ``footnote_refs`` lists); otherwise the dict is
    emitted once without a ``label`` key.

    :param sxs: xml nodes of the section-by-section analysis
    :param part: passed to ``parse_into_labels`` along with each title
    :param fr_start_page: page passed to ``find_page`` for the titles at
        this level; children receive their parent's page instead
    :returns: list of dicts in document order
    """
    output = []
    # Explicit length check: "while sxs:" is deprecated
    while len(sxs):
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs)
        page = find_page(title, title.sourceline, fr_start_page)

        # Only P elements are kept, and only copies of them are edited
        paragraph_xmls = [deepcopy(elem) for elem in text_els
                          if elem.tag == 'P']
        footnote_refs = []
        for p_idx, paragraph_xml in enumerate(paragraph_xmls):
            for unwanted_tag in ('PRTPAGE', 'FTREF'):
                spaces_then_remove(paragraph_xml, unwanted_tag)
            swap_emphasis_tags(paragraph_xml)

            # Anything inside a SU can also be ignored
            for su in paragraph_xml.xpath('./SU'):
                # NOTE(review): etree.tostring returns bytes on Python 3
                # unless encoding='unicode' is passed -- confirm this
                # matches the type body_to_string produces
                su_markup = etree.tostring(su)
                # Position is taken while the SU is still in the paragraph
                position = body_to_string(paragraph_xml).find(su_markup)
                footnote_refs.append({
                    'paragraph': p_idx,
                    'reference': su.text,
                    'offset': position,
                })

                owner = su.getparent()
                after_su = su.tail
                if after_su:
                    # Re-attach the text following the SU to the previous
                    # sibling's tail, or to the parent's text when the SU
                    # has no previous sibling
                    left = su.getprevious()
                    if left is not None:
                        left.tail = (left.tail or '') + after_su
                    else:
                        owner.text = (owner.text or '') + after_su
                owner.remove(su)

        paragraphs = [body_to_string(elem) for elem in paragraph_xmls]
        children = build_section_by_section(sub_sections, part, page)

        next_structure = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnote_refs,
        }

        labels = parse_into_labels(title.text, part)
        if not labels:
            output.append(next_structure)
        # One shallow copy per label, each tagged with its own 'label'
        output.extend(dict(next_structure, label=label) for label in labels)

    return output