def generate_diff(left_xml, right_xml):
    """ Compute the changes between two full RegML trees.

        Both inputs are converted with build_reg_tree(), frozen with
        FrozenNode.from_node() and handed to regulations-parser's
        changes_between(); the outcome is returned as a plain dict. """
    old_tree = build_reg_tree(left_xml)
    new_tree = build_reg_tree(right_xml)

    frozen_old = FrozenNode.from_node(old_tree)
    frozen_new = FrozenNode.from_node(new_tree)
    changes = changes_between(frozen_old, frozen_new)
    return dict(changes)
Esempio n. 2
0
def generate_diff(left_xml, right_xml):
    """ Return a dictionary of changes between two full RegML trees.

        Thin wrapper around regulations-parser's changes_between(): each
        input is built into a reg tree, frozen, and the pair is compared. """
    trees = [build_reg_tree(left_xml), build_reg_tree(right_xml)]
    frozen_left, frozen_right = [
        FrozenNode.from_node(tree) for tree in trees
    ]
    return dict(changes_between(frozen_left, frozen_right))
def diff_driver(regulation_files):
    """ Compare each 2-element combination of the given regulation files.

        Both files of a pair are read and parsed with etree.fromstring(),
        the parsed XML is turned into reg trees, and the two trees are
        passed to recursive_comparison(). Nothing is returned. """
    for first_path, second_path in combinations(regulation_files, 2):
        with open(first_path, 'r') as first_file:
            first_xml = etree.fromstring(first_file.read())

        with open(second_path, 'r') as second_file:
            second_xml = etree.fromstring(second_file.read())

        first_tree = build_reg_tree(first_xml)
        second_tree = build_reg_tree(second_xml)

        recursive_comparison(first_tree, second_tree)
    def test_height(self):
        """ The tree built from test_xml should report a height of 5. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        self.assertEqual(reg_tree.height(), 5)
Esempio n. 5
0
def ecfr_notice(title, cfr_part, notice, applies_to, act_title,
                act_section, with_version=False, without_notice=False):
    """ Generate RegML for a single notice from eCFR XML.

        :param title: CFR title, passed to the Builder as ``cfr_title``.
        :param cfr_part: CFR part; passed to the Builder and also used as
            the directory component when locating ``applies_to``.
        :param notice: document number of the notice to build.
        :param applies_to: file name (joined under ``cfr_part``) of the
            existing RegML that the new notice applies to.
        :param act_title: first element of the act reference handed to
            ``Builder.generate_layers``.
        :param act_section: second element of that act reference.
        :param with_version: when true, also write the regulation file for
            the newly compiled tree.
        :param without_notice: when true, skip writing the notice file.
        :returns: None. Returns early (after printing a message) when the
            built notice contains no ``'changes'`` entry.
        :raises StopIteration: if none of the fetched notices carries the
            requested document number. """

    # Get the notice the new one applies to
    with open(find_file(os.path.join(cfr_part, applies_to)), 'r') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)
    doc_number = xml_tree.find('.//{eregs}documentNumber').text

    # Validate the file relative to schema. The returned validator is not
    # used any further in this function, so it is not bound to a name.
    get_validator(xml_tree)

    # Get the ecfr builder
    builder = Builder(cfr_title=title,
                      cfr_part=cfr_part,
                      doc_number=doc_number,
                      checkpointer=None,
                      writer_type='XML')

    # Fetch the notices from the FR API and find the notice we're
    # looking for
    builder.fetch_notices_json()
    print([n['document_number'] for n in builder.notices_json])
    notice_json = next((n for n in builder.notices_json
                        if n['document_number'] == notice))

    # Build the notice. A separate local is used so the ``notice``
    # parameter (a document number) is not rebound to the built notice.
    built_notice = builder.build_single_notice(notice_json)[0]

    if 'changes' not in built_notice:
        print('There are no changes in this notice to apply.')
        return

    # We've successfully fetched and parsed the new notice.
    # Build the reg tree and layers for the notice it applies to.
    old_tree = build_reg_tree(xml_tree)

    # Build the new reg tree from the old_tree + notice changes
    last_version = doc_number
    version = built_notice['document_number']
    merged_changes = builder.merge_changes(version, built_notice['changes'])
    reg_tree = compile_regulation(old_tree, merged_changes)
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree,
                                     [act_title, act_section],
                                     layer_cache)

    # Write the notice file
    if not without_notice:
        builder.write_notice(version,
                             old_tree=old_tree,
                             reg_tree=reg_tree,
                             layers=layers,
                             last_version=last_version)

    # Write the regulation file for the new notice. The compiled tree is
    # ``reg_tree``; the previous code referenced an undefined ``new_tree``
    # here, which raised NameError whenever with_version was set.
    if with_version:
        builder.write_regulation(reg_tree, layers=layers)
    def test_section_callout(self):
        """ A callout-only paragraph in a section keeps its own node. """
        section_xml = etree.fromstring("""
          <section label="1024-3" sectionNum="3" xmlns="eregs">
            <subject>§ 1024.3 Questions or suggestions from public and copies of public guidance documents.</subject>
            <paragraph label="1024-3-p1" marker="">
              <content>
                <callout type="note">
                  <line>Note:</line>
                  <line>This is a test callout.</line>
                </callout>
              </content>
            </paragraph>
          </section>""")
        tree = build_reg_tree(section_xml)

        # The callout must NOT be treated as an intro paragraph: its text
        # stays on the node carrying the paragraph's label instead of being
        # folded into the node carrying the section's label.
        callout_paragraph = OrderedDict([
            (u'children', []),
            (u'label', [u'1024', u'3', u'p1']),
            (u'node_type', u'regtext'),
            (u'text', u'Note:\n                  This is a test callout.'),
            (u'marker', u''),
        ])
        expected_section = OrderedDict([
            (u'children', [callout_paragraph]),
            (u'label', [u'1024', u'3']),
            (u'node_type', u'regtext'),
            (u'text', u''),
            (u'title', u'\xa7 1024.3 Questions or suggestions from public '
                       u'and copies of public guidance documents.'),
        ])

        self.assertEqual(expected_section, tree.to_json())
    def test_appendix_graphic(self):
        """ A graphic-only paragraph in an appendix keeps its own node. """
        appendix_xml = etree.fromstring("""
          <appendixSection appendixSecNum="1" label="1013-A-1" xmlns="eregs">
              <subject>A-1—Model Open-End or Finance Vehicle Lease Disclosures</subject>
              <paragraph label="1013-A-1-p1" marker="">
                <content>
                  <graphic>
                    <altText></altText>
                    <text>![](ER19DE11.010)</text>
                    <url>https://s3.amazonaws.com/images.federalregister.gov/ER19DE11.010/original.gif</url>
                  </graphic>
                </content>
              </paragraph>
          </appendixSection>""")
        tree = build_reg_tree(appendix_xml)

        # The graphic must NOT be treated as an intro paragraph: its text
        # stays on the node carrying the paragraph's label instead of being
        # folded into the node carrying the appendix section's label.
        graphic_paragraph = OrderedDict([
            (u'children', []),
            (u'label', [u'1013', u'A', u'1', u'p1']),
            (u'node_type', u'appendix'),
            (u'text', '![](ER19DE11.010)'),
            (u'marker', ''),
        ])
        expected_appendix = OrderedDict([
            (u'children', [graphic_paragraph]),
            (u'label', [u'1013', u'A', u'1']),
            (u'node_type', u'appendix'),
            (u'text', u''),
            (u'title', u'A-1\u2014Model Open-End or Finance Vehicle '
                       u'Lease Disclosures'),
        ])

        self.assertEqual(expected_appendix, tree.to_json())
    def test_markerless_nodes(self):
        """ An empty marker ('') must survive into the node's JSON. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        def is_parent(node):
            return node.string_label == '1234-1-a'

        parent = reg_tree.find_node(is_parent)[0]

        # The first two children of 1234-1-a are expected to be markerless.
        for position in (0, 1):
            child_json = parent.children[position].to_json()
            self.assertEqual(child_json['marker'], '')
    def test_labels(self):
        """ Smoke test: labels() can be called on the tree from test_xml.

            NOTE(review): the returned labels are never inspected; the only
            assertion is a tautology. TODO: assert on the actual labels. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        reg_tree.labels()

        self.assertEqual(1, 1)
    def test_flatten_tree(self):
        """ Smoke test: flatten() can be called on the tree from test_xml.

            NOTE(review): the flattened result is never inspected; the only
            assertion is a tautology. TODO: assert on the actual output. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        reg_tree.flatten()

        self.assertEqual(1, 1)
    def test_find_node_single(self):
        """ find_node() with a label predicate returns just that node. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        matches = reg_tree.find_node(
            lambda node: bool(node.string_label == '1234-1-a'))

        self.assertEqual(len(matches), 1)

        found = matches[0]
        self.assertEqual(found.string_label, '1234-1-a')
        self.assertEqual(found.text, "a I'm a marked paragraph")
        self.assertEqual(found.marker, "a")
        self.assertEqual(found.depth, 3)
    def test_appendix_callout(self):
        """ A callout-only paragraph in an appendix keeps its own node. """
        appendix_xml = etree.fromstring("""
        <appendixSection appendixSecNum="6" label="1024-A-h6" xmlns="eregs">
          <subject>Instructions for Completing HUD-1A</subject>
          <paragraph label="1024-A-h6-p92" marker="">
            <content>
              <callout type="note">
                <line>Note:</line>
                <line>The HUD-1A is an optional form that may be used for refinancing and subordinate-lien federally related mortgage loans, as well as for any other one-party transaction that does not involve the transfer of title to residential real property. The HUD-1 form may also be used for such transactions, by utilizing the borrower's side of the HUD-1 and following the relevant parts of the instructions as set forth above. The use of either the HUD-1 or HUD-1A is not mandatory for open-end lines of credit (home-equity plans), as long as the provisions of Regulation Z are followed.</line>
              </callout>
            </content>
          </paragraph>
        </appendixSection>""")
        tree = build_reg_tree(appendix_xml)

        # The two <line> elements are expected to come out joined by the
        # newline and indentation that separate them in the XML above.
        note_text = "Note:\n                The HUD-1A is an optional form that may be used for refinancing and subordinate-lien federally related mortgage loans, as well as for any other one-party transaction that does not involve the transfer of title to residential real property. The HUD-1 form may also be used for such transactions, by utilizing the borrower's side of the HUD-1 and following the relevant parts of the instructions as set forth above. The use of either the HUD-1 or HUD-1A is not mandatory for open-end lines of credit (home-equity plans), as long as the provisions of Regulation Z are followed."

        # The callout must NOT be treated as an intro paragraph: its text
        # stays on the node carrying the paragraph's label instead of being
        # folded into the node carrying the appendixSection's label.
        callout_paragraph = {
            "children": [],
            "label": ["1024", "A", "h6", "p92"],
            "marker": "",
            "node_type": "appendix",
            "text": note_text,
        }
        expected_appendix = {
            "children": [callout_paragraph],
            "label": ["1024", "A", "h6"],
            "node_type": "appendix",
            "text": "",
            "title": "Instructions for Completing HUD-1A",
        }

        self.assertEqual(expected_appendix, tree.to_json())
    def test_find_node_multiple(self):
        """ find_node() returns every node whose text contains 'marked'. """
        reg_tree = build_reg_tree(etree.fromstring(test_xml))

        matches = reg_tree.find_node(
            lambda node: bool(node.text.find('marked') > -1))

        self.assertEqual(len(matches), 4)

        # Only the first two of the four matches are checked in detail.
        expected_leading = (
            ('1234-1', "I'm an unmarked paragraph", None),
            ('1234-1-a', "a I'm a marked paragraph", "a"),
        )
        for position, (label, text, marker) in enumerate(expected_leading):
            found = matches[position]
            self.assertEqual(found.string_label, label)
            self.assertEqual(found.text, text)
            self.assertEqual(found.marker, marker)
    def test_build_reg_tree(self):
        """ Basic introspection of the tree built from self.root. """
        node = build_reg_tree(self.root)
        node_dict = node.to_json()

        # Root node
        self.assertEqual(node_dict['title'], 'REGULATION TESTING')
        self.assertEqual(node_dict['label'], ['1234'])
        self.assertEqual(len(node_dict['children']), 3)
        self.assertEqual(node.depth, 0)

        # Direct children, in order: subpart, appendix, interpretations.
        # Each one sits at depth 1 under the root.
        for position, suffix in enumerate(('Subpart', 'A', 'Interp')):
            child_dict = node_dict['children'][position]
            self.assertEqual(child_dict['label'], ['1234', suffix])
            self.assertEqual(node.children[position].depth, 1)
 def test_build_reg_tree_intro_para(self):
     """ An unmarked first paragraph becomes the section's own text. """
     section_xml = etree.fromstring("""
     <section label="foo" xmlns="eregs">
       <subject>Some Subject</subject>
       <paragraph label="foo-p1" marker="">
         <content>
           An unmarked intro paragraph.
         </content>
       </paragraph>
       <paragraph label="foo-a" marker="a">
         <content>A marked paragraph</content>
       </paragraph>
     </section>
     """)

     # Only the marked paragraph is expected as a child; the intro
     # paragraph's text is expected on the section node itself.
     marked_paragraph = {
         'children': [],
         'label': ['foo', 'a'],
         'node_type': 'regtext',
         'text': 'a A marked paragraph',
         'marker': 'a',
     }
     expected_section = {
         'children': [marked_paragraph],
         'label': ['foo'],
         'node_type': 'regtext',
         'text': 'An unmarked intro paragraph.',
         'title': 'Some Subject',
     }

     tree = build_reg_tree(section_xml)
     self.assertEqual(expected_section, tree.to_json())
Esempio n. 16
0
def generate_json(regulation_file, check_terms=False):
    """ Build the regulation JSON plus all layers for one RegML file and
        write each of them out with write_layer().

        :param regulation_file: name of the RegML file, resolved through
            find_file().
        :param check_terms: when true, additionally run the validator's
            validate_term_references() check.
        :returns: tuple ``(reg_number, notice, xml_tree)``. """
    with open(find_file(regulation_file), 'r') as source:
        raw_xml = source.read()
    xml_tree = etree.fromstring(raw_xml, etree.XMLParser(huge_tree=True))

    # Validate the file relative to schema
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Build every layer from the XML (same order as they were always built).
    marker_layer = build_paragraph_marker_layer(xml_tree)
    internal_cite_layer = build_internal_citations_layer(xml_tree)
    external_cite_layer = build_external_citations_layer(xml_tree)
    term_layer = build_terms_layer(xml_tree)
    meta_layer = build_meta_layer(xml_tree)
    toc_layer = build_toc_layer(xml_tree)
    keyterm_layer = build_keyterm_layer(xml_tree)
    graphics_layer = build_graphics_layer(xml_tree)
    formatting_layer = build_formatting_layer(xml_tree)
    interp_layer = build_interp_layer(xml_tree)
    analysis_layer = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the content validations and print whatever events the validator
    # has collected. Processing continues regardless of what is reported.
    validator.validate_terms(xml_tree, term_layer)
    validator.validate_internal_cites(xml_tree, internal_cite_layer)
    if check_terms:
        validator.validate_term_references(xml_tree, term_layer,
                                           regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The version derived from the file name wins over the document number
    # declared inside the XML whenever the two disagree.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.basename(regulation_file).replace('.xml', '')
    if notice != version:
        message = 'Notice ({}) different from version ({}), using version'
        print(message.format(notice, version))
        notice = version

    outputs = (
        (reg_json, 'regulation'),
        (meta_layer, 'layer/meta'),
        (marker_layer, 'layer/paragraph-markers'),
        (internal_cite_layer, 'layer/internal-citations'),
        (external_cite_layer, 'layer/external-citations'),
        (term_layer, 'layer/terms'),
        (toc_layer, 'layer/toc'),
        (keyterm_layer, 'layer/keyterms'),
        (graphics_layer, 'layer/graphics'),
        (formatting_layer, 'layer/formatting'),
        (interp_layer, 'layer/interpretations'),
        (analysis_layer, 'layer/analyses'),
        (notice_dict, 'notice'),
    )
    for payload, doc_type in outputs:
        write_layer(payload, reg_number, notice, doc_type)

    return reg_number, notice, xml_tree
def parser_driver(regulation_file,
                  check_terms=False,
                  correct_interps=False,
                  headerize_interps=False,
                  fix_missed_cites=False):
    """ Validate one RegML file, build its regulation JSON and all layers,
        and write each of them out with write_layer().

        :param regulation_file: path of the RegML file to process.
        :param check_terms: when true, additionally run the validator's
            validate_term_references() check.
        :param correct_interps: when true, call the validator's
            insert_interp_markers() on the parsed XML.
        :param headerize_interps: when true, call the validator's
            headerize_interps() on the parsed XML.
        :param fix_missed_cites: when true, call the validator's
            fix_omitted_cites() on the parsed XML.
        :returns: None.
        :raises SystemExit: with status 1 when the file does not validate
            against the schema (the validator events are printed first). """
    with open(regulation_file, 'r') as f:
        reg_xml = f.read()
    xml_tree = etree.fromstring(reg_xml)

    # validate relative to schema
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # A failed validation must not look like success to the caller:
        # exit with a non-zero status (this used to be sys.exit(0)).
        sys.exit(1)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]
    # we can correct interps right away if necessary
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the content validations and print whatever events the validator
    # has collected. Unlike the schema check above, these findings are
    # only reported; processing continues.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # The version derived from the file name wins over the document number
    # declared inside the XML whenever the two disagree.
    notice = xml_tree.find('.//{eregs}documentNumber').text
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    # (payload, doc type) pairs, written in this order.
    outputs = (
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    )
    for payload, doc_type in outputs:
        write_layer(payload, reg_number, notice, doc_type)