Ejemplo n.º 1
0
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    Every ``{eregs}paragraph`` below ``root`` is scanned for ``{eregs}ref``
    descendants whose ``reftype`` is ``external``. The ``target`` attribute of
    each reference is split on ``:`` into a citation type and a citation, and
    the citation is split further on ``-``. Offsets are the positions reported
    by ``find_all_occurrences`` for the reference text inside the string
    ``marker + ' ' + <paragraph content text>``.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of external citations, suitable for direct
        transformation into JSON for use with the eRegs frontend. Keys are paragraph labels;
        paragraphs without any located citation are omitted.
    :rtype: :class:`collections.OrderedDict`:

    :raises IndexError: if a reference target contains no ``:`` separator. A diagnostic
        message is printed before the exception propagates.
    """

    layer_dict = OrderedDict()

    for paragraph in root.findall('.//{eregs}paragraph'):
        # A paragraph without a ``marker`` attribute yields None; fall back
        # to an empty marker so the concatenation below cannot fail.
        marker = paragraph.get('marker') or ''
        par_text = marker + ' ' + xml_node_text(
            paragraph.find('{eregs}content'))
        par_label = paragraph.get('label')

        citation_list = []
        for cite in paragraph.findall('.//{eregs}ref[@reftype="external"]'):
            target = cite.get('target').split(':')
            citation_type = target[0]
            try:
                citation_target = target[1].split('-')
            except IndexError:
                print(
                    "Error in external citations: '{}' is not formatted properly. "
                    .format(target),
                    "Look for an empty or malformed target in a reftype=\"external\"."
                )
                # Bare raise keeps the original traceback intact.
                raise

            text = cite.text
            positions = find_all_occurrences(par_text, text)

            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = citation_type
            cite_dict['offsets'] = [
                [pos, pos + len(text)] for pos in positions
            ]

            # Skip references whose text was not located, and avoid emitting
            # the same citation entry twice for one paragraph.
            if cite_dict['offsets'] and cite_dict not in citation_list:
                citation_list.append(cite_dict)

        if citation_list:
            layer_dict[par_label] = citation_list

    return layer_dict
Ejemplo n.º 2
0
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    For each ``{eregs}paragraph`` under ``root``, every ``{eregs}ref``
    descendant with ``reftype="external"`` is turned into an entry holding the
    citation (the part of ``target`` after ``:``, split on ``-``), the
    citation type (the part before ``:``) and the offsets of the reference
    text. Offsets come from ``find_all_occurrences`` applied to the string
    ``marker + ' ' + <paragraph content text>``.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of external citations, suitable for direct
        transformation into JSON for use with the eRegs frontend. Only paragraphs with at
        least one located citation get a key.
    :rtype: :class:`collections.OrderedDict`:

    :raises IndexError: if a reference target has no ``:`` separator; a diagnostic message
        is printed first.
    """

    paragraphs = root.findall('.//{eregs}paragraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        # ``get`` returns None when the attribute is absent; treat that as an
        # empty marker rather than crashing on ``None + ' '``.
        marker = paragraph.get('marker') or ''
        par_text = marker + ' ' + xml_node_text(
            paragraph.find('{eregs}content'))
        par_label = paragraph.get('label')
        cites = paragraph.findall('.//{eregs}ref[@reftype="external"]')
        citation_list = []

        for cite in cites:
            target = cite.get('target').split(':')
            citation_type = target[0]
            try:
                citation_target = target[1].split('-')
            except IndexError:
                print("Error in external citations: '{}' is not formatted properly. ".format(target),
                      "Look for an empty or malformed target in a reftype=\"external\".")
                # Re-raise without rebinding so the traceback is preserved.
                raise

            text = cite.text
            positions = find_all_occurrences(par_text, text)

            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = citation_type
            cite_dict['offsets'] = [[pos, pos + len(text)] for pos in positions]

            # Entries without offsets (text not located) and exact duplicates
            # are not added to the paragraph's list.
            if cite_dict['offsets'] and cite_dict not in citation_list:
                citation_list.append(cite_dict)

        if citation_list:
            layer_dict[par_label] = citation_list

    return layer_dict
Ejemplo n.º 3
0
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    The function works in two passes over all ``{eregs}paragraph`` and
    ``{eregs}interpParagraph`` elements:

    1. Definitions: for every ``{eregs}def`` that is a direct child of a
       paragraph's content, record where the defined text sits in the
       paragraph and store it under the key ``<term>:<label>``. The term part
       of the key is the lower-cased term, singularized with ``inflect``
       unless the term is listed in ``settings.SPECIAL_SINGULAR_NOUNS``.
    2. References: for every ``{eregs}ref`` with ``reftype="term"``, compute
       its offset from the text preceding it in the content and link it to
       the first definition whose ``reference`` equals the ref's ``target``.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable for direct
        transformation into JSON for use with the eRegs frontend. Paragraph labels map to
        lists of term references; the extra key ``referenced`` maps to the definitions.
    :rtype: :class:`collections.OrderedDict`:
    """

    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    inf_engine.defnoun('bonus', 'bonuses')

    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')

    paragraphs_with_defs = [
        par for par in paragraphs if par.find('{eregs}content') is not None
        and par.find('{eregs}content').find('{eregs}def') is not None
    ]

    # Pass 1: collect the definitions.
    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        title = paragraph.find('{eregs}title')
        content = apply_formatting(paragraph.find('{eregs}content'))
        par_text = xml_node_text(content).strip()

        total_offset = get_offset(paragraph, marker, title)

        for defn in content.findall('{eregs}def'):
            defined_term = defn.get('term')
            lowered = defined_term.lower()
            # singular_noun() gives a falsy result when it has no singular
            # form to offer; the term is then used as written.
            singular = inf_engine.singular_noun(lowered)
            if singular and lowered not in settings.SPECIAL_SINGULAR_NOUNS:
                key = singular + ':' + label
            else:
                key = lowered + ':' + label

            def_text = defn.text
            positions = find_all_occurrences(par_text, def_text)
            if not positions:
                # The defined text could not be located in the paragraph
                # text, so there is no position to record for it.
                continue

            pos = positions[0]
            def_dict = OrderedDict()
            def_dict['position'] = [
                pos + total_offset, pos + len(def_text) + total_offset
            ]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            definitions_dict[key] = def_dict

    # Pass 2: locate the references to defined terms.
    for paragraph in paragraphs:
        raw_content = paragraph.find('{eregs}content')
        if raw_content is None:
            # No content element means no term references to record.
            continue

        content = apply_formatting(raw_content)
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''

        label = paragraph.get('label')
        # If this is a subparagraph of a type that wants an intro paragraph
        # and this paragraph is intro text, set the paragraph's label to reference
        # the parent's
        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            label = paragraph.getparent().get('label')

        if terms:
            terms_dict[label] = []

        total_offset = get_offset(paragraph, marker, title)

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Rebuild the text that precedes this reference from the content's
            # own text plus the text and tail of each earlier direct child.
            running_par_text = content.text or ''
            for child in content:
                if child != term:
                    running_par_text += (child.text or '') + (child.tail or '')
                else:
                    break

            text = term.text
            target = term.get('target')
            defn_location = next(
                (key for key, defn in definitions_dict.items()
                 if defn['reference'] == target),
                None)
            if defn_location is not None:
                term_position = len(running_par_text) + total_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            ref_dict = OrderedDict()
            ref_dict['offsets'] = [
                [pos, pos + len(term)] for pos in positions
            ]
            ref_dict['ref'] = term_targets[term]
            if ref_dict['offsets'] and ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict
Ejemplo n.º 4
0
def apply_formatting(content_elm):
    """
    Applies special inline formatting to variables, callouts and dashes, as
    expected by the frontend formatting layer.

    The input element is never modified: all work happens on a deep copy.

    * ``{eregs}variable`` direct children are replaced by the text
      ``Var_{sub}`` (plus the variable's tail).
    * ``{eregs}callout`` descendants are removed and the content text is set
      to the callout text (``note``) or to a fenced block built from the
      callout's lines (``code``).
    * ``{eregs}dash`` descendants are replaced by their text followed by
      ``_____``; a dash's tail is discarded.

    :param content_elm: The ``<content>`` element to which the inline formatting is to be applied.
    :type content_elm: :class:`etree.Element`

    :return: the element with the inline formatting applied.
    :rtype: :class:`etree.Element`:
    """

    working_content = deepcopy(content_elm)

    # Before building the content text, replace any variable
    # elements with Var_{sub} so that reg-site will know what to
    # do with them.
    for variable in working_content.findall('{eregs}variable'):
        # Note: the lxml/etree API makes this a lot harder than it
        # should be by using text/tail instead of text nodes.
        subscript = variable.find('{eregs}subscript')
        replacement_text = '{var}_{{{sub}}}'.format(var=variable.text,
                                                    sub=subscript.text)
        if variable.tail is not None:
            replacement_text += variable.tail

        # If there's a previous node, simply append the text to its
        # tail; otherwise, operate on the parent.
        previous = variable.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + replacement_text
        else:
            v_parent = variable.getparent()
            v_parent.text = (v_parent.text or '') + replacement_text

        # Remove the variable node
        variable.getparent().remove(variable)

    # Do the same for callouts
    for callout in working_content.findall('.//{eregs}callout'):
        lines = callout.findall('{eregs}line')
        # Callouts *should* be the only things within the content
        # element of a paragraph. Assume that.
        callout.getparent().remove(callout)
        callout_type = callout.get('type')
        if callout_type == 'note':
            working_content.text = xml_node_text(callout).strip()
        elif callout_type == 'code':
            # An empty <line/> has no text; render it as an empty line
            # instead of failing inside join().
            working_content.text = '```\n' + \
                '\n'.join(line.text or '' for line in lines) + \
                '```'

    # Do the same for dashes.
    for dash in working_content.findall('.//{eregs}dash'):
        # Dashes have to end a line, so we ignore the dash's tail
        dash_text = (dash.text or '') + '_____'

        # Dashes are located with a descendant search, so a dash is not
        # necessarily a direct child of the content element. Work through
        # the dash's own parent so that nested dashes can be replaced too.
        dash_parent = dash.getparent()

        # Append the dash_text to either parent or previous sibling to
        # replace the dash element.
        previous = dash.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + dash_text
        else:
            dash_parent.text = (dash_parent.text or '') + dash_text
        dash_parent.remove(dash)

    return working_content
Ejemplo n.º 5
0
def build_formatting_layer(root):
    """
    Build the formatting layer from the provided root of the XML tree. Formatting elements include
    things like callouts, tables, lines indicating spaces on a form, and so on.

    Every ``{eregs}paragraph`` and ``{eregs}interpParagraph`` is inspected for
    dash, variable, callout and table descendants of its content element. The
    entries for one paragraph are stored in that order under the paragraph's
    label. Paragraphs without a content element, or without any formatting
    element, produce no key.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of formatting elements, suitable for direct
        transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`:
    """

    layer_dict = OrderedDict()
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')

    for paragraph in paragraphs:
        content = paragraph.find('{eregs}content')
        if content is None:
            # Nothing to format in a paragraph that has no content element.
            continue

        label = paragraph.get('label')
        dashes = content.findall('.//{eregs}dash')

        entries = [_dash_entry(dash) for dash in dashes]
        entries.extend(
            _variable_entry(variable)
            for variable in content.findall('.//{eregs}variable'))
        entries.extend(
            _callout_entry(callout)
            for callout in content.findall('.//{eregs}callout'))
        entries.extend(
            _table_entry(table)
            for table in content.findall('.//{eregs}table'))

        if not entries:
            continue

        # A paragraph with dashes replaces whatever is already stored under
        # its label; the other element kinds are appended to existing entries.
        if dashes or label not in layer_dict:
            layer_dict[label] = entries
        else:
            layer_dict[label].extend(entries)

    return layer_dict


def _dash_entry(dash):
    """Return the layer entry for one ``{eregs}dash`` element."""
    dash_text = dash.text or ''
    entry = OrderedDict()
    entry['dash_data'] = {'text': dash_text}
    entry['locations'] = [0]
    entry['text'] = dash_text + '_____'
    return entry


def _variable_entry(variable):
    """Return the layer entry for one ``{eregs}variable`` element."""
    subscript = variable.find('{eregs}subscript')
    entry = OrderedDict()
    entry['subscript_data'] = {
        'variable': variable.text,
        'subscript': subscript.text,
    }
    entry['locations'] = [0]
    entry['text'] = '{var}_{{{sub}}}'.format(var=variable.text,
                                            sub=subscript.text)
    return entry


def _callout_entry(callout):
    """Return the layer entry for one ``{eregs}callout`` element."""
    lines = callout.findall('{eregs}line')
    callout_type = callout.get('type')
    entry = OrderedDict()
    entry['fence_data'] = {
        'lines': [line.text for line in lines],
        'type': callout_type
    }
    entry['locations'] = [0]
    # Only 'note' and 'code' callouts receive a 'text' key.
    if callout_type == 'note':
        entry['text'] = xml_node_text(callout).strip()
    elif callout_type == 'code':
        # An empty <line/> has no text; render it as an empty line instead
        # of failing inside join().
        entry['text'] = '```\n' + \
            '\n'.join(line.text or '' for line in lines) + \
            '```'
    return entry


def _table_entry(table):
    """Return the layer entry for one ``{eregs}table`` element."""
    header = table.find('{eregs}header')
    # A table without a <header> element simply has no header rows.
    header_rows = [] if header is None else \
        header.findall('{eregs}columnHeaderRow')

    table_data = OrderedDict()
    table_data['header'] = [
        [_column_header(column) for column in row.findall('{eregs}column')]
        for row in header_rows
    ]
    table_data['rows'] = [
        [cell.text or '' for cell in row.findall('{eregs}cell')]
        for row in table.findall('{eregs}row')
    ]

    entry = OrderedDict()
    entry['locations'] = [0]
    entry['table_data'] = table_data
    # NOTE(review): the previous implementation assembled a pipe-delimited
    # rendering of the table but never emitted it; 'text' was always the
    # empty string, and that output is kept unchanged here.
    entry['text'] = ''
    return entry


def _column_header(column):
    """Return the header description for one ``{eregs}column`` element."""
    column_header = OrderedDict()
    column_header['colspan'] = int(column.get('colspan'))
    column_header['rowspan'] = int(column.get('rowspan'))
    column_header['text'] = column.text or ''
    return column_header
Ejemplo n.º 6
0
def build_reg_tree(root, parent=None, depth=0):
    """
    This function builds the basic JSON regulation tree recursively from the supplied
    root element of the XML.

    The element's tag (with the ``{eregs}`` namespace stripped) selects how
    the node's label, title, text, marker and node type are filled in and
    which child elements are recursed into. Tags without a dedicated branch,
    and ``section`` elements without any attribute, yield a node that only
    has its depth set and no children.

    :param root: The XML root. If this function is called from the outside, the root
        should be the very top of the tree, i.e. the ``<regulation>`` element.
    :type root: :class:`etree.Element`
    :param parent: The parent of the current element. None if the root is the
        ``<regulation>`` element.
    :type parent: :class:`etree.Element`
    :param depth: The depth at which the current element resides.
    :type depth: :class:`int`

    :return: The top node of the resulting tree.
    :rtype: :class:`regulation.node.RegNode`
    """

    ns_prefix = '{eregs}'
    tag = root.tag.replace(ns_prefix, '')
    node = RegNode(include_children=True)

    if tag == 'regulation':
        preamble = root.find(ns_prefix + 'preamble')
        section = preamble.find('{eregs}cfr/{eregs}section').text
        fdsys = root.find(ns_prefix + 'fdsys')
        title = fdsys.find(ns_prefix + 'title').text

        node.label = [section]
        node.marker = None
        node.node_type = 'regtext'
        node.title = title

        subparts = root.findall('.//{eregs}subpart')
        appendices = root.findall('.//{eregs}appendix')
        interpretations = root.findall('.//{eregs}interpretations')

        children = subparts + appendices + interpretations

    elif tag == 'subpart':
        title = root.find(ns_prefix + 'title')
        if title is not None:
            node.node_type = 'subpart'
            node.title = title.text
            node.label = parent.label + ['Subpart', root.get('subpartLetter')]
        else:
            # A subpart without a title is emitted as an 'emptypart'.
            node.node_type = 'emptypart'
            node.title = ''
            node.label = parent.label + ['Subpart']

        node.text = ''

        content = root.find('{eregs}content')
        children = content.findall('{eregs}section')

    elif tag == 'section' and root.attrib != {}:
        subject = root.find(ns_prefix + 'subject')
        label = root.get('label').split('-')
        node.title = subject.text
        node.node_type = 'regtext'
        node.label = label

        children = root.findall('{eregs}paragraph')

        # Check to see if the first child is an unmarked intro
        # paragraph. Reg-site expects those to be the 'text' of this
        # node rather than child nodes in their own right.
        if len(children) > 0:
            first_child = children[0]
            # First_child may be an intro paragraph
            if is_intro_text(first_child):
                content = xml_node_text(first_child.find('{eregs}content'))
                node.text = content.strip()
                del children[0]

    elif tag == 'paragraph':
        title = root.find('{eregs}title')
        content = apply_formatting(root.find('{eregs}content'))
        content_text = xml_node_text(content)

        if title is not None:
            if title.get('type') != 'keyterm':
                node.title = title.text
            else:
                # Keyterms are expected by reg-site to be included in
                # the content text rather than the title of a node.
                content_text = title.text + content_text

        node.marker = root.get('marker')
        # Both the literal 'none' and a missing marker attribute mean there
        # is nothing to prefix; without the None check the node text would
        # start with the string "None".
        if node.marker is None or node.marker == 'none':
            marker = ''
        else:
            marker = node.marker

        node.label = root.get('label').split('-')

        graphic = content.find('{eregs}graphic')
        if graphic is not None:
            # A graphic's own text element replaces the paragraph text.
            node.text = graphic.find('{eregs}text').text
        else:
            node.text = '{} {}'.format(marker, content_text).strip()

        node.node_type = parent.node_type
        node.mixed_text = xml_mixed_text(content)
        node.source_xml = root

        children = root.findall('{eregs}paragraph')

    elif tag == 'appendix':

        title = root.find('{eregs}appendixTitle')
        label = root.get('label').split('-')
        node.node_type = 'appendix'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}appendixSection')

    elif tag == 'appendixSection':

        subject = root.find('{eregs}subject')
        label = root.get('label').split('-')
        node.node_type = 'appendix'
        node.text = ''
        node.title = subject.text
        node.label = label

        children = root.findall('{eregs}paragraph')

        # Check to see if the first child is an unmarked intro
        # paragraph. Reg-site expects those to be the 'text' of this
        # node rather than child nodes in their own right.
        if len(children) > 0:
            first_child = children[0]
            # First_child may be an intro paragraph
            if is_intro_text(first_child):
                content = xml_node_text(first_child.find('{eregs}content'))
                node.text = content.strip()
                del children[0]

    elif tag == 'interpretations':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpSection')

    elif tag == 'interpSection' or tag == 'interpAppSection':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpParagraph')

    elif tag == 'interpAppendix':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpAppSection')

    elif tag == 'interpParagraph':

        title = root.find('{eregs}title')
        content = apply_formatting(root.find('{eregs}content'))
        content_text = xml_node_text(content)

        if title is not None:
            if title.get('type') != 'keyterm':
                node.title = title.text
            else:
                # Keyterms are expected by reg-site to be included in
                # the content text rather than the title of a node.
                content_text = title.text + content_text

        node.marker = root.get('marker', '')
        if node.marker == 'none':
            node.marker = ''

        node.label = root.get('label').split('-')
        # Unlike regular paragraphs, the marker is not prefixed to the text.
        node.text = content_text
        node.node_type = 'interp'
        node.source_xml = root

        children = root.findall('{eregs}interpParagraph')

    else:
        children = []

    node.depth = depth

    for child in children:
        node.children.append(
            build_reg_tree(child, parent=node, depth=depth + 1))

    return node
Ejemplo n.º 7
0
def build_internal_citations_layer(root):
    """
    Build the internal citations layer from the provided root of the XML tree.

    For every ``{eregs}paragraph`` and ``{eregs}interpParagraph``, each
    ``{eregs}ref`` with ``reftype="internal"`` that is a direct child of the
    (formatted) content produces one entry. The entry holds the reference's
    ``target`` split on ``-`` and a single offset pair. The start offset is
    the length of the content text preceding the reference plus the offset
    returned by ``get_offset`` for the paragraph.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of internal citations, suitable for direct
        transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`:

    :raises TypeError: if a reference has no text; a diagnostic naming the paragraph label
        is printed before the exception propagates.
    """

    paragraphs = root.findall('.//{eregs}paragraph') + root.findall(
        './/{eregs}interpParagraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        marker = paragraph.get('marker', '')
        title = paragraph.find('{eregs}title')

        if marker == 'none' or marker is None:
            marker = ''

        par_label = paragraph.get('label')
        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            par_label = paragraph.getparent().get('label')

        total_offset = get_offset(paragraph, marker, title)

        # Both mappings are keyed by the citation text and hold parallel
        # lists: the start offset and the target of each occurrence. Keeping
        # the target per occurrence matters when the same text points at
        # different targets within one paragraph.
        cite_positions = OrderedDict()
        cite_targets = OrderedDict()

        content = apply_formatting(paragraph.find('{eregs}content'))
        cites = content.findall('{eregs}ref[@reftype="internal"]')
        citation_list = []
        for cite in cites:
            target = cite.get('target').split('-')
            text = cite.text

            # Rebuild the text that precedes this reference from the
            # content's own text plus the text and tail of earlier children.
            running_par_text = content.text or ''
            for child in content:
                if child != cite:
                    running_par_text += (child.text or '') + \
                        (child.tail or '')
                else:
                    break

            cite_position = len(running_par_text) + total_offset
            cite_positions.setdefault(text, []).append(cite_position)
            cite_targets.setdefault(text, []).append(target)

        for cite, positions in cite_positions.items():
            for pos, target in zip(positions, cite_targets[cite]):
                # Handle empty citations
                try:
                    cite_dict = {
                        'citation': target,
                        'offsets': [[pos, pos + len(cite)]]
                    }
                except TypeError as e:
                    print("TypeError occurred: {}".format(str(e)))
                    print("Look for a reference without text in {} @ pos {}".
                          format(par_label, positions))
                    raise

                if cite_dict not in citation_list:
                    citation_list.append(cite_dict)

        if citation_list:
            layer_dict[par_label] = citation_list

    return layer_dict
Ejemplo n.º 8
0
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    Two passes are made over all ``{eregs}paragraph`` and
    ``{eregs}interpParagraph`` elements:

    1. Definitions: every ``{eregs}def`` that is a direct child of a
       paragraph's content is stored under the key ``<term>:<label>``,
       together with the position of the defined text inside
       ``marker + ' ' + <content text>`` (stripped). The term part of the key
       is the lower-cased term, singularized with ``inflect`` unless the term
       is listed in ``settings.SPECIAL_SINGULAR_NOUNS``.
    2. References: every ``{eregs}ref`` with ``reftype="term"`` is located by
       the length of the content text preceding it, shifted by the marker and
       keyterm offsets, and linked to the first definition whose
       ``reference`` equals the ref's ``target``.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable for direct
        transformation into JSON for use with the eRegs frontend. Paragraph labels map to
        lists of term references; the extra key ``referenced`` maps to the definitions.
    :rtype: :class:`collections.OrderedDict`:
    """

    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    inf_engine.defnoun('bonus', 'bonuses')

    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')

    paragraphs_with_defs = [par for par in paragraphs if par.find('{eregs}content') is not None
                            and par.find('{eregs}content').find('{eregs}def') is not None]

    # Pass 1: collect the definitions.
    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        content = apply_formatting(paragraph.find('{eregs}content'))
        par_text = (marker + ' ' + xml_node_text(content)).strip()

        for defn in content.findall('{eregs}def'):
            defined_term = defn.get('term')
            lowered = defined_term.lower()
            # singular_noun() gives a falsy result when it has no singular
            # form to offer; the term is then used as written.
            singular = inf_engine.singular_noun(lowered)
            if singular and lowered not in settings.SPECIAL_SINGULAR_NOUNS:
                key = singular + ':' + label
            else:
                key = lowered + ':' + label

            def_text = defn.text
            positions = find_all_occurrences(par_text, def_text)
            if not positions:
                # The defined text could not be located in the paragraph
                # text, so there is no position to record for it.
                continue

            pos = positions[0]
            def_dict = OrderedDict()
            def_dict['position'] = [pos, pos + len(def_text)]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            definitions_dict[key] = def_dict

    # Pass 2: locate the references to defined terms.
    for paragraph in paragraphs:
        raw_content = paragraph.find('{eregs}content')
        if raw_content is None:
            # No content element means no term references to record.
            continue

        content = apply_formatting(raw_content)
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''
        is_interp = paragraph.tag == '{eregs}interpParagraph'

        label = paragraph.get('label')
        if wants_intro_text(paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_text
            label = paragraph.getparent().get('label')

        if terms:
            terms_dict[label] = []

        # Marker offset: the marker and the space after it count towards the
        # offset, except for interp paragraphs.
        if marker != '' and not is_interp:
            marker_offset = len(marker + ' ')
        else:
            marker_offset = 0

        # Keyterm offset.
        # Note: reg-site treats interp-paragraphs as "special" — they
        # don't get the keyterm text included, so we don't include an
        # offset here.
        if title is not None and title.get('type') == 'keyterm' and \
                not is_interp:
            keyterm_offset = len(title.text)
        else:
            keyterm_offset = 0

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Rebuild the text that precedes this reference from the content's
            # own text plus the text and tail of each earlier direct child.
            running_par_text = content.text or ''
            for child in content:
                if child != term:
                    running_par_text += (child.text or '') + (child.tail or '')
                else:
                    break

            text = term.text
            target = term.get('target')
            defn_location = next(
                (key for key, defn in definitions_dict.items()
                 if defn['reference'] == target),
                None)
            if defn_location is not None:
                term_position = len(running_par_text) + marker_offset + keyterm_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            ref_dict = OrderedDict()
            ref_dict['offsets'] = [[pos, pos + len(term)] for pos in positions]
            ref_dict['ref'] = term_targets[term]
            if ref_dict['offsets'] and ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict
Ejemplo n.º 9
0
def apply_formatting(content_elm):
    """
    Applies special inline formatting to variables and callouts, as expected by
    the frontend formatting layer.

    The element passed in is left untouched; all changes are made to a deep
    copy. Direct ``<variable>`` children of the copy are replaced by the plain
    text ``var_{sub}`` (each variable is expected to contain a ``<subscript>``
    child). ``<callout>`` elements are removed and the copy's leading text is
    set to the callout text (``type="note"``) or to a fenced block built from
    the callout's ``<line>`` children (``type="code"``). Empty ``<line/>``
    elements contribute an empty line to the fenced block.

    :param content_elm: The ``<content>`` element to which the inline formatting is to be applied.
    :type content_elm: :class:`etree.Element`

    :return: a copy of the element with the inline formatting applied.
    :rtype: :class:`etree.Element`:
    """

    working_content = deepcopy(content_elm)

    # Before building the content text, replace any variable
    # elements with Var_{sub} so that reg-site will know what to
    # do with them.
    for variable in working_content.findall('{eregs}variable'):
        # Note: lxml/etree API makes this a lot harder than it
        # should be by use text/tail instead of text nodes.
        subscript = variable.find('{eregs}subscript')
        replacement_text = '{var}_{{{sub}}}'.format(
            var=variable.text, sub=subscript.text)
        # Carry the variable's tail along so the text that followed it
        # is kept once the node itself is removed.
        if variable.tail is not None:
            replacement_text += variable.tail

        # If there's a previous node, simply append the text to its
        # tail.
        previous = variable.getprevious()
        if previous is not None:
            if previous.tail is None:
                previous.tail = ''
            previous.tail += replacement_text

        # Otherwise, operate on the parent
        else:
            v_parent = variable.getparent()
            if v_parent.text is None:
                v_parent.text = ''
            v_parent.text += replacement_text

        # Remove the variable node
        variable.getparent().remove(variable)

    # Do the same for callouts
    for callout in working_content.findall('.//{eregs}callout'):
        callout_type = callout.get('type')
        # Callouts *should* be the only things within the content
        # element of a paragraph. Assume that.
        callout.getparent().remove(callout)
        if callout_type == 'note':
            working_content.text = xml_node_text(callout).strip()
        elif callout_type == 'code':
            lines = callout.findall('{eregs}line')
            # An empty <line/> has text None; treat it as a blank line
            # instead of letting str.join raise a TypeError.
            working_content.text = '```\n' + \
                '\n'.join(line.text or '' for line in lines) + \
                '```'

    return working_content
Ejemplo n.º 10
0
def build_formatting_layer(root):
    """
    Build the formatting layer from the provided root of the XML tree. Formatting elements include
    things like callouts, tables, lines indicating spaces on a form, and so on.

    Every ``<paragraph>`` and ``<interpParagraph>`` is inspected for ``<dash>``,
    ``<variable>``, ``<callout>`` and ``<table>`` descendants of its ``<content>``
    element. Paragraphs containing none of them get no entry in the result. For a
    given label the entries appear in that order: dashes, variables, callouts, tables.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of formatting elements, suitable for direct
        transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`:
    """

    layer_dict = OrderedDict()
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')

    for paragraph in paragraphs:
        content = paragraph.find('{eregs}content')
        label = paragraph.get('label')

        dash_entries = [
            _formatting_dash_entry(dash)
            for dash in content.findall('.//{eregs}dash')]

        other_entries = [
            _formatting_variable_entry(variable)
            for variable in content.findall('.//{eregs}variable')]
        other_entries += [
            _formatting_callout_entry(callout)
            for callout in content.findall('.//{eregs}callout')]
        other_entries += [
            _formatting_table_entry(table)
            for table in content.findall('.//{eregs}table')]

        # Dash entries (re)initialise the list stored for the label; the
        # remaining kinds append to whatever is already there.
        if dash_entries:
            layer_dict[label] = dash_entries
        if other_entries:
            layer_dict.setdefault(label, []).extend(other_entries)

    return layer_dict


def _formatting_dash_entry(dash):
    """Return the formatting-layer entry for a single ``<dash>`` element."""
    dash_text = dash.text or ''
    entry = OrderedDict()
    entry['text'] = dash_text
    entry['dash_data'] = {'text': dash_text}
    entry['locations'] = [0]
    return entry


def _formatting_variable_entry(variable):
    """Return the formatting-layer entry for a ``<variable>`` and its ``<subscript>``."""
    subscript = variable.find('{eregs}subscript')
    entry = OrderedDict()
    entry['subscript_data'] = {
        'variable': variable.text,
        'subscript': subscript.text,
    }
    entry['locations'] = [0]
    entry['text'] = '{var}_{{{sub}}}'.format(
        var=variable.text, sub=subscript.text)
    return entry


def _formatting_callout_entry(callout):
    """Return the formatting-layer entry for a ``<callout>`` (note or code fence)."""
    lines = callout.findall('{eregs}line')
    callout_type = callout.get('type')
    entry = OrderedDict()
    entry['fence_data'] = {
        'lines': [line.text for line in lines],
        'type': callout_type
    }
    entry['locations'] = [0]
    # Callouts of any other type get no 'text' key at all.
    if callout_type == 'note':
        entry['text'] = xml_node_text(callout).strip()
    elif callout_type == 'code':
        # An empty <line/> has text None; join it as a blank line rather
        # than raising a TypeError.
        entry['text'] = '```\n' + \
            '\n'.join(line.text or '' for line in lines) + \
            '```'
    return entry


def _formatting_table_entry(table):
    """Return the formatting-layer entry for a ``<table>`` (header rows and data rows)."""
    table_data = OrderedDict()
    table_data['header'] = []
    table_data['rows'] = []

    # A table without a <header> element simply has no header rows.
    header = table.find('{eregs}header')
    header_rows = [] if header is None else \
        header.findall('{eregs}columnHeaderRow')
    for header_row in header_rows:
        columns = []
        for column in header_row.findall('{eregs}column'):
            # colspan/rowspan are converted with int(), so every header
            # column must carry both attributes.
            column_dict = OrderedDict()
            column_dict['colspan'] = int(column.get('colspan'))
            column_dict['rowspan'] = int(column.get('rowspan'))
            column_dict['text'] = column.text or ''
            columns.append(column_dict)
        table_data['header'].append(columns)

    for row in table.findall('{eregs}row'):
        table_data['rows'].append(
            [cell.text or '' for cell in row.findall('{eregs}cell')])

    entry = OrderedDict()
    entry['locations'] = [0]
    entry['table_data'] = table_data
    entry['text'] = ''
    return entry
Ejemplo n.º 11
0
def build_reg_tree(root, parent=None, depth=0):
    """
    This function builds the basic JSON regulation tree recursively from the supplied
    root element of the XML.

    The element's tag (with the ``{eregs}`` namespace prefix stripped) selects how the
    node's label, title, text and type are filled in and which child elements are
    recursed into. Elements with an unhandled tag, and ``<section>`` elements without
    any attributes, produce a bare node with no children.

    :param root: The XML root. If this function is called from the outside, the root
        should be the very top of the tree, i.e. the ``<regulation>`` element.
    :type root: :class:`etree.Element`
    :param parent: The parent of the current element. None if the root is the
        ``<regulation>`` element.
    :type parent: :class:`etree.Element`
    :param depth: The depth at which the current element resides.
    :type depth: :class:`int`

    :return: The top node of the resulting tree.
    :rtype: :class:`regulation.node.RegNode`
    """

    ns_prefix = '{eregs}'
    tag = root.tag.replace(ns_prefix, '')
    node = RegNode(include_children=True)

    if tag == 'regulation':
        preamble = root.find(ns_prefix + 'preamble')
        section = preamble.find('{eregs}cfr/{eregs}section').text
        fdsys = root.find(ns_prefix + 'fdsys')
        title = fdsys.find(ns_prefix + 'title').text

        node.label = [section]
        node.marker = None
        node.node_type = 'regtext'
        node.title = title

        subparts = root.findall('.//{eregs}subpart')
        appendices = root.findall('.//{eregs}appendix')
        interpretations = root.findall('.//{eregs}interpretations')

        children = subparts + appendices + interpretations

    elif tag == 'subpart':
        title = root.find(ns_prefix + 'title')
        if title is not None:
            node.node_type = 'subpart'
            node.title = title.text
            node.label = parent.label + ['Subpart', root.get('subpartLetter')]
        else:
            # A subpart without a title is emitted as an 'emptypart'.
            node.node_type = 'emptypart'
            node.title = ''
            node.label = parent.label + ['Subpart']

        node.text = ''

        content = root.find('{eregs}content')
        children = content.findall('{eregs}section')

    elif tag == 'section' and root.attrib != {}:
        subject = root.find(ns_prefix + 'subject')
        label = root.get('label').split('-')
        node.title = subject.text
        node.node_type = 'regtext'
        node.label = label

        children = root.findall('{eregs}paragraph')

        # Check to see if the first child is an unmarked intro
        # paragraph. Reg-site expects those to be be the 'text' of this
        # node rather than child nodes in their own right.
        if len(children) > 0:
            first_child = children[0]
            # First_child may be an intro paragraph
            if is_intro_text(first_child):
                content = xml_node_text(first_child.find('{eregs}content'))
                node.text = content.strip()
                del children[0]

    elif tag == 'paragraph':
        title = root.find('{eregs}title')
        content = apply_formatting(root.find('{eregs}content'))
        content_text = xml_node_text(content)

        if title is not None:
            if title.get('type') != 'keyterm':
                node.title = title.text
            else:
                # Keyterms are expected by reg-site to be included in
                # the content text rather than the title of a node.
                content_text = title.text + content_text

        node.marker = root.get('marker')
        # Both the literal 'none' and an absent marker attribute mean
        # "no visible marker". Without the None check a missing attribute
        # would be formatted into the node text as the string "None".
        if node.marker is None or node.marker == 'none':
            marker = ''
        else:
            marker = node.marker

        node.label = root.get('label').split('-')

        graphic = content.find('{eregs}graphic')
        if graphic is not None:
            # For graphics the node text is the graphic's own <text> child.
            node.text = graphic.find('{eregs}text').text
        else:
            node.text = '{} {}'.format(marker, content_text).strip()

        # Paragraphs inherit the node type of the node they hang off.
        node.node_type = parent.node_type
        node.mixed_text = xml_mixed_text(content)
        node.source_xml = etree.tostring(root, encoding='UTF-8')

        children = root.findall('{eregs}paragraph')

    elif tag == 'appendix':

        title = root.find('{eregs}appendixTitle')
        label = root.get('label').split('-')
        node.node_type = 'appendix'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}appendixSection')

    elif tag == 'appendixSection':

        subject = root.find('{eregs}subject')
        label = root.get('label').split('-')
        node.node_type = 'appendix'
        node.text = ''
        node.title = subject.text
        node.label = label

        children = root.findall('{eregs}paragraph')

        # Check to see if the first child is an unmarked intro
        # paragraph. Reg-site expects those to be be the 'text' of this
        # node rather than child nodes in their own right.
        if len(children) > 0:
            first_child = children[0]
            # First_child may be an intro paragraph
            if is_intro_text(first_child):
                content = xml_node_text(first_child.find('{eregs}content'))
                node.text = content.strip()
                del children[0]

    elif tag == 'interpretations':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpSection')

    elif tag == 'interpSection' or tag == 'interpAppSection':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpParagraph')

    elif tag == 'interpAppendix':

        title = root.find('{eregs}title')
        label = root.get('label').split('-')
        node.node_type = 'interp'
        node.text = ''
        node.title = title.text
        node.label = label

        children = root.findall('{eregs}interpAppSection')

    elif tag == 'interpParagraph':

        title = root.find('{eregs}title')
        content = apply_formatting(root.find('{eregs}content'))
        content_text = xml_node_text(content)
        if title is not None:
            node.title = title.text

        node.marker = root.get('marker', '')
        if node.marker == 'none':
            node.marker = ''

        node.label = root.get('label').split('-')
        # Unlike regular paragraphs, the marker is not prepended to the
        # text of an interpretation paragraph.
        node.text = content_text
        node.node_type = 'interp'
        node.source_xml = etree.tostring(root, encoding='UTF-8')

        children = root.findall('{eregs}interpParagraph')

    else:
        children = []

    node.depth = depth

    # Recurse; the freshly built node becomes the parent of its children.
    for child in children:
        node.children.append(build_reg_tree(child, parent=node, depth=depth+1))

    return node
Ejemplo n.º 12
0
def build_internal_citations_layer(root):
    """
    Build the internal citations layer from the provided root of the XML tree.

    For every ``<paragraph>`` and ``<interpParagraph>``, each direct
    ``<ref reftype="internal">`` child of the (formatted) ``<content>`` element
    yields one entry holding the reference's target, split on ``-``, and its
    ``[start, end]`` character offsets. Offsets count the text preceding the
    reference inside the content, plus the marker and keyterm prefixes for
    regular paragraphs. Entries are grouped by reference text in order of first
    appearance, and each reference keeps its own target even when several
    references share the same text.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of internal citations, suitable for direct
        transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`:
    """

    paragraphs = root.findall('.//{eregs}paragraph') + root.findall('.//{eregs}interpParagraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        marker = paragraph.get('marker', '')
        title = paragraph.find('{eregs}title')

        if marker == 'none' or marker is None:
            marker = ''

        par_label = paragraph.get('label')
        if wants_intro_text(paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            par_label = paragraph.getparent().get('label')

        if marker != '' and paragraph.tag != '{eregs}interpParagraph':
            marker_offset = len(marker + ' ')
        else:
            marker_offset = 0

        # Keyterm offset.
        # Note: reg-site treats interp-paragraphs as "special" — they
        # don't get the keyterm text included, so we don't include an
        # offset here.
        if title is not None and title.get('type') == 'keyterm' and \
                paragraph.tag != '{eregs}interpParagraph':
            keyterm_offset = len(title.text)
        else:
            keyterm_offset = 0

        content = apply_formatting(paragraph.find('{eregs}content'))

        # Reference text -> [(position, target), ...]. The target is stored
        # with each occurrence so that two references with identical text
        # but different targets do not overwrite one another.
        occurrences_by_text = OrderedDict()

        # Walk the direct children once, tracking how much text (each
        # child's own text plus its tail) precedes the current child.
        preceding_length = len(content.text or '')
        for child in content:
            if child.tag == '{eregs}ref' and child.get('reftype') == 'internal':
                target = child.get('target').split('-')
                position = preceding_length + marker_offset + keyterm_offset
                occurrences_by_text.setdefault(child.text, []).append(
                    (position, target))
            preceding_length += len(child.text or '') + len(child.tail or '')

        citation_list = []
        for text, occurrences in occurrences_by_text.items():
            for position, target in occurrences:
                cite_dict = {'citation': target,
                             'offsets': [[position, position + len(text)]]}
                if cite_dict not in citation_list:
                    citation_list.append(cite_dict)

        if citation_list != []:
            layer_dict[par_label] = citation_list

    return layer_dict
Ejemplo n.º 13
0
def build_internal_citations_layer(root):
    """
    Build the internal citations layer from the provided root of the XML tree.

    For every ``<paragraph>`` and ``<interpParagraph>``, each direct
    ``<ref reftype="internal">`` child of the (formatted) ``<content>`` element
    yields one entry holding the reference's target, split on ``-``, and its
    ``[start, end]`` character offsets. Offsets count the text preceding the
    reference inside the content plus the value returned by ``get_offset`` for
    the paragraph. Entries are grouped by reference text in order of first
    appearance, and each reference keeps its own target even when several
    references share the same text.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of internal citations, suitable for direct
        transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`:
    :raises TypeError: if a reference has no text; a hint naming the paragraph
        label is printed before the error is re-raised.
    """

    paragraphs = root.findall('.//{eregs}paragraph') + root.findall('.//{eregs}interpParagraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        marker = paragraph.get('marker', '')
        title = paragraph.find('{eregs}title')

        if marker == 'none' or marker is None:
            marker = ''

        par_label = paragraph.get('label')
        if wants_intro_text(paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            par_label = paragraph.getparent().get('label')

        total_offset = get_offset(paragraph, marker, title)

        content = apply_formatting(paragraph.find('{eregs}content'))

        # Reference text -> [(position, target), ...]. The target is stored
        # with each occurrence so that two references with identical text
        # but different targets do not overwrite one another.
        occurrences_by_text = OrderedDict()

        # Walk the direct children once, tracking how much text (each
        # child's own text plus its tail) precedes the current child.
        preceding_length = len(content.text or '')
        for child in content:
            if child.tag == '{eregs}ref' and child.get('reftype') == 'internal':
                target = child.get('target').split('-')
                occurrences_by_text.setdefault(child.text, []).append(
                    (preceding_length + total_offset, target))
            preceding_length += len(child.text or '') + len(child.tail or '')

        citation_list = []
        for text, occurrences in occurrences_by_text.items():
            for position, target in occurrences:
                # Handle empty citations
                try:
                    cite_dict = {'citation': target,
                                 'offsets': [[position, position + len(text)]]}
                except TypeError as e:
                    positions = [pos for pos, _ in occurrences]
                    print("TypeError occurred: {}".format(str(e)))
                    print("Look for a reference without text in {} @ pos {}".format(par_label, positions))
                    raise

                if cite_dict not in citation_list:
                    citation_list.append(cite_dict)

        if citation_list != []:
            layer_dict[par_label] = citation_list

    return layer_dict