Example #1
0
def build_section_by_section(sxs, fr_start_page, previous_label):
    """Convert the section-by-section analysis nodes into nested dicts.

    Every pass of the loop asks split_into_ttsr for the next title, its text
    elements, its sub-sections and the nodes still left to process. The
    entry built from them carries 'page', 'title', 'paragraphs', 'children'
    and 'footnote_refs'. 'labels' is only attached when the title parses
    into labels and at least one of them is neither equal to previous_label
    nor a backtrack from it; in that case previous_label moves on to the
    last label. The text before the first '-' of previous_label is handed
    to the helpers as the CFR part. Returns the list of entries."""
    analyses = []
    # len() instead of truth-testing: "while sxs:" is deprecated
    while len(sxs) > 0:
        cfr_part = previous_label.split('-')[0]
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)
        page = find_page(title, title.sourceline, fr_start_page)

        # Clean-up below happens on copies, never on the nodes passed in
        paragraph_xmls = [deepcopy(node) for node in text_els
                          if node.tag in ('P', 'FP')]
        footnotes = []
        for position, paragraph in enumerate(paragraph_xmls):
            spaces_then_remove(paragraph, 'PRTPAGE')
            spaces_then_remove(paragraph, 'FTREF')
            swap_emphasis_tags(paragraph)
            # Each direct SU child is recorded as a footnote reference and
            # then taken out of the paragraph
            for marker in paragraph.xpath('./SU'):
                # NOTE(review): etree.tostring returns bytes on Python 3;
                # confirm body_to_string yields a matching type
                serialized = etree.tostring(marker)
                # The offset must be measured while the marker is still
                # part of the paragraph
                offset = body_to_string(paragraph).find(serialized)
                footnotes.append({'paragraph': position,
                                  'reference': marker.text,
                                  'offset': offset})

                # Hand the marker's trailing text to whatever precedes it
                # before the marker itself is removed
                trailing = marker.tail
                if trailing:
                    neighbour = marker.getprevious()
                    if neighbour is not None:
                        neighbour.tail = (neighbour.tail or '') + trailing
                    else:
                        owner = marker.getparent()
                        owner.text = (owner.text or '') + trailing
                marker.getparent().remove(marker)

        paragraphs = [body_to_string(paragraph)
                      for paragraph in paragraph_xmls]
        labels = parse_into_labels(title.text, cfr_part)
        # Children inherit the newest label when the title produced one
        label_for_children = labels[-1] if labels else previous_label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        entry = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnotes,
        }

        # No labels => subheader. Labels that only repeat previous_label or
        # backtrack from it leave the entry unlabelled as well.
        if labels:
            has_new_label = any(
                not (label == previous_label or
                     is_backtrack(previous_label, label))
                for label in labels)
            if has_new_label:
                previous_label = labels[-1]
                entry['labels'] = labels
        analyses.append(entry)

    return analyses
Example #2
0
def build_section_by_section(sxs, fr_start_page, previous_label):
    """Build nested dicts out of the section-by-section analysis nodes.

    The loop repeatedly lets split_into_ttsr peel off a title, its text
    elements, its sub-sections and the remaining nodes. Each resulting
    entry holds 'page', 'title', 'paragraphs', 'children' and
    'footnote_refs'. An entry receives 'labels' only if the title parses
    into labels and not all of them equal previous_label or backtrack from
    it; previous_label is then replaced by the last label. The portion of
    previous_label before its first '-' is used as the CFR part. Returns
    the list of entries."""
    results = []
    # len() instead of truth-testing: "while sxs:" is deprecated
    while len(sxs) > 0:
        cfr_part = previous_label.split('-')[0]
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)
        page = find_page(title, title.sourceline, fr_start_page)

        # Work on copies so the nodes passed in are not altered here
        paragraph_xmls = [deepcopy(node) for node in text_els
                          if node.tag in ('P', 'FP')]
        footnotes = []
        for position, paragraph in enumerate(paragraph_xmls):
            spaces_then_remove(paragraph, 'PRTPAGE')
            spaces_then_remove(paragraph, 'FTREF')
            swap_emphasis_tags(paragraph)
            # Direct SU children become footnote references and are then
            # stripped from the paragraph
            for marker in paragraph.xpath('./SU'):
                serialized = etree.tounicode(marker)
                # Measure the offset before the marker leaves the tree
                offset = body_to_string(paragraph).find(serialized)
                footnotes.append({'paragraph': position,
                                  'reference': marker.text,
                                  'offset': offset})

                # Keep the text that followed the marker by attaching it
                # to the preceding sibling, or to the parent's text
                trailing = marker.tail
                if trailing:
                    neighbour = marker.getprevious()
                    if neighbour is not None:
                        neighbour.tail = (neighbour.tail or '') + trailing
                    else:
                        owner = marker.getparent()
                        owner.text = (owner.text or '') + trailing
                marker.getparent().remove(marker)

        paragraphs = [body_to_string(paragraph)
                      for paragraph in paragraph_xmls]
        labels = parse_into_labels(title.text, cfr_part)
        # Children are built under the newest label, if there is one
        label_for_children = labels[-1] if labels else previous_label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        entry = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnotes,
        }

        # No labels => subheader. A repeat of previous_label or a backtrack
        # from it also leaves the entry without 'labels'.
        if labels:
            has_new_label = any(
                not (label == previous_label or
                     is_backtrack(previous_label, label))
                for label in labels)
            if has_new_label:
                previous_label = labels[-1]
                entry['labels'] = labels
        results.append(entry)

    return results
Example #3
0
def build_section_by_section(sxs, part, fr_start_page):
    """Convert the section-by-section analysis nodes into nested dicts.

    split_into_ttsr is called until no nodes remain; each call yields a
    title, its text elements, its sub-sections and the leftover nodes. The
    entry for a title holds 'page', 'title', 'paragraphs', 'children' and
    'footnote_refs'. When the title parses into labels, one shallow copy of
    the entry is emitted per label with that label stored under 'label';
    otherwise the entry is emitted once without it. Returns the list of
    entries."""
    collected = []
    # len() instead of truth-testing: "while sxs:" is deprecated
    while len(sxs) > 0:
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs)
        page = find_page(title, title.sourceline, fr_start_page)

        # Clean-up below happens on copies, never on the nodes passed in
        paragraph_xmls = [deepcopy(node) for node in text_els
                          if node.tag == 'P']
        footnotes = []
        for position, paragraph in enumerate(paragraph_xmls):
            spaces_then_remove(paragraph, 'PRTPAGE')
            spaces_then_remove(paragraph, 'FTREF')
            swap_emphasis_tags(paragraph)
            # Each direct SU child is recorded as a footnote reference and
            # then taken out of the paragraph
            for marker in paragraph.xpath('./SU'):
                # NOTE(review): etree.tostring returns bytes on Python 3;
                # confirm body_to_string yields a matching type
                serialized = etree.tostring(marker)
                # The offset must be measured while the marker is still
                # part of the paragraph
                offset = body_to_string(paragraph).find(serialized)
                footnotes.append({'paragraph': position,
                                  'reference': marker.text,
                                  'offset': offset})

                # Hand the marker's trailing text to whatever precedes it
                # before the marker itself is removed
                trailing = marker.tail
                if trailing:
                    neighbour = marker.getprevious()
                    if neighbour is not None:
                        neighbour.tail = (neighbour.tail or '') + trailing
                    else:
                        owner = marker.getparent()
                        owner.text = (owner.text or '') + trailing
                marker.getparent().remove(marker)

        paragraphs = [body_to_string(paragraph)
                      for paragraph in paragraph_xmls]
        children = build_section_by_section(sub_sections, part, page)

        entry = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnotes,
        }
        labels = parse_into_labels(title.text, part)
        if not labels:
            collected.append(entry)
        # Shallow copies: the per-label entries share the same paragraphs,
        # children and footnote_refs objects
        collected.extend(dict(entry, label=label) for label in labels)

    return collected
Example #4
0
def add_footnotes(notice, notice_xml):
    """ Parse the notice xml for footnotes and add them to the notice.

    notice['footnotes'] is reset to an empty dict and then filled with one
    entry per child of an FTNT element that contains an SU marker. The key
    is the text of the first SU found; the value is the stripped
    serialization of the child once that marker has been removed. Children
    without an SU are skipped. The xml nodes are modified in place. """
    notice['footnotes'] = {}
    for child in notice_xml.xpath('//FTNT/*'):
        spaces_then_remove(child, 'PRTPAGE')
        swap_emphasis_tags(child)

        ref = child.xpath('.//SU')
        if ref:
            marker = ref[0]
            # The footnote body starts with whatever followed the marker
            child.text = marker.tail
            child.remove(marker)
            # lxml reports absent text/tail as None rather than '', so fall
            # back to '' instead of failing on the concatenation
            pieces = [child.text or '']
            pieces.extend(etree.tostring(cc) for cc in child)
            pieces.append(child.tail or '')
            notice['footnotes'][marker.text] = ''.join(pieces).strip()
Example #5
0
def add_footnotes(notice, notice_xml):
    """ Parse the notice xml for footnotes and add them to the notice.

    notice['footnotes'] is reset to an empty dict and then filled with one
    entry per child of an FTNT element that contains an SU marker. The key
    is the text of the first SU found; the value is the stripped
    serialization of the child once that marker has been removed. Children
    without an SU are skipped. The xml nodes are modified in place. """
    notice['footnotes'] = {}
    for child in notice_xml.xpath('//FTNT/*'):
        spaces_then_remove(child, 'PRTPAGE')
        swap_emphasis_tags(child)

        ref = child.xpath('.//SU')
        if ref:
            marker = ref[0]
            # The footnote body starts with whatever followed the marker
            child.text = marker.tail
            child.remove(marker)
            # lxml reports absent text/tail as None rather than '', so fall
            # back to '' instead of failing on the concatenation
            pieces = [child.text or '']
            pieces.extend(etree.tostring(cc) for cc in child)
            pieces.append(child.tail or '')
            notice['footnotes'][marker.text] = ''.join(pieces).strip()
def build_supplement_tree(reg_part, node):
    """ Build the tree for the supplement section.

    The title is read from the node as given. The children handed to
    parse_from_xml come from a deep copy that has been passed through
    spaces_then_remove for PRTPAGE, so that call does not touch the
    caller's node. The root is an INTERP Node labelled
    [reg_part, Node.INTERP_MARK]. Returns the result of parse_from_xml. """
    title = get_app_title(node)
    node = spaces_then_remove(deepcopy(node), 'PRTPAGE')
    root = Node(node_type=Node.INTERP,
                label=[reg_part, Node.INTERP_MARK],
                title=title)

    # list(node) is the documented replacement for the deprecated
    # getchildren()
    return parse_from_xml(root, list(node))
def build_supplement_tree(reg_part, node):
    """ Build the tree for the supplement section.

    The title is read from the node as given. The children handed to
    parse_from_xml come from a deep copy that has been passed through
    spaces_then_remove for PRTPAGE, so that call does not touch the
    caller's node. The root is an INTERP Node labelled
    [reg_part, Node.INTERP_MARK]. Returns the result of parse_from_xml. """
    title = get_app_title(node)
    node = spaces_then_remove(deepcopy(node), 'PRTPAGE')
    root = Node(
        node_type=Node.INTERP,
        label=[reg_part, Node.INTERP_MARK],
        title=title)

    # list(node) is the documented replacement for the deprecated
    # getchildren()
    return parse_from_xml(root, list(node))
def standardize_xml(xml):
    """We will assume a format of Supplement I header followed by HDs,
    STARS, and Ps, so move anything in an EXTRACT, APPENDIX or SUBPART up a
    level.

    Works on a deep copy that has been passed through spaces_then_remove
    for PRTPAGE, and returns that copy. Only the child elements of a
    wrapper are moved; they keep their order and take the wrapper's place
    in its parent, after which the wrapper is removed."""
    xml = spaces_then_remove(deepcopy(xml), 'PRTPAGE')
    for extract in xml.xpath(".//EXTRACT|.//APPENDIX|.//SUBPART"):
        ex_parent = extract.getparent()
        idx = ex_parent.index(extract)
        # Snapshot the children first: inserting a child elsewhere moves it
        # out of `extract`, which would otherwise be mutated mid-iteration
        for child in list(extract):
            ex_parent.insert(idx, child)
            idx += 1
        ex_parent.remove(extract)
    return xml
def standardize_interp_xml(xml):
    """We will assume a format of Supplement I header followed by HDs,
    STARS, and Ps, so move anything in an EXTRACT, APPENDIX or SUBPART up a
    level.

    Works on a deep copy that has been passed through
    util.spaces_then_remove for PRTPAGE, and returns that copy. Only the
    child elements of a wrapper are moved; they keep their order and take
    the wrapper's place in its parent, after which the wrapper is
    removed."""
    xml = util.spaces_then_remove(deepcopy(xml), 'PRTPAGE')
    for extract in xml.xpath(".//EXTRACT|.//APPENDIX|.//SUBPART"):
        ex_parent = extract.getparent()
        idx = ex_parent.index(extract)
        # Snapshot the children first: inserting a child elsewhere moves it
        # out of `extract`, which would otherwise be mutated mid-iteration
        for child in list(extract):
            ex_parent.insert(idx, child)
            idx += 1
        ex_parent.remove(extract)
    return xml