def test_build_toc_layer_appendix_section(self):
     """A TOC nested in an appendixSection is reported under that section.

     The fixture's only tableOfContents lives inside the appendixSection
     labelled ``1234-A-1``; the layer is expected to hold a single entry
     under that label.
     """
     appendix_xml = """
     <appendix xmlns="eregs" appendixLetter="A" label="1234-A">
       <appendixTitle>Appendix A</appendixTitle>
       <appendixSection appendixSecNum="1" label="1234-A-1">
         <subject>Section 1</subject>
         <tableOfContents>
           <tocAppEntry target="1234-A-1-A">
             <appendixLetter>A-1-A</appendixLetter>
             <appendixSubject>Something</appendixSubject>
           </tocAppEntry>
         </tableOfContents>
         <paragraph label="1234-A-1-A" marker="">
           <content>Something here</content>
         </paragraph>
       </appendixSection>
     </appendix>
     """
     root = etree.fromstring(appendix_xml)

     # One entry: the target split into label parts plus the entry subject.
     toc_entry = {
         u'index': [u'1234', u'A', u'1', 'A'],
         u'title': 'Something',
     }
     expected = {'1234-A-1': [toc_entry]}

     self.assertEqual(expected, build_toc_layer(root))
 def test_build_toc_layer_subpart(self):
     """A subpart's TOC entries are reported, in order, under its label."""
     # NOTE(review): both tocSecEntry elements use target "1234-1" and
     # sectionNum 1 and differ only in subject; the expectation mirrors
     # that. Possibly a copy-paste in the fixture -- confirm before editing.
     subpart_xml = """
     <subpart xmlns="eregs" subpartLetter="A" label="1234-Subpart-A">
       <title>General</title>
       <tableOfContents label="1234-Subpart-A-TOC">
         <tocSecEntry target="1234-1">
           <sectionNum>1</sectionNum>
           <sectionSubject>§ 1234.1</sectionSubject>
         </tocSecEntry>
         <tocSecEntry target="1234-1">
           <sectionNum>1</sectionNum>
           <sectionSubject>§ 1234.2</sectionSubject>
         </tocSecEntry>
       </tableOfContents>
       <content></content>
     </subpart>
     """
     root = etree.fromstring(subpart_xml)

     first_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.1'}
     second_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.2'}
     expected = {'1234-Subpart-A': [first_entry, second_entry]}

     actual = build_toc_layer(root)
     self.assertEqual(expected, actual)
 def test_build_toc_layer_appendix(self):
     """An appendix-level TOC is reported under the appendix's label."""
     appendix_xml = """
     <appendix xmlns="eregs" appendixLetter="A" label="1234-A">
       <appendixTitle>Appendix A</appendixTitle>
       <tableOfContents>
         <tocAppEntry target="1234-A-1">
           <appendixLetter>A-1</appendixLetter>
           <appendixSubject>Some Subject</appendixSubject>
         </tocAppEntry>
       </tableOfContents>
     </appendix>
     """
     root = etree.fromstring(appendix_xml)

     # The single tocAppEntry should surface with its subject as the title.
     toc_entry = {u'index': [u'1234', u'A', u'1'], u'title': 'Some Subject'}
     expected = {'1234-A': [toc_entry]}

     self.assertEqual(expected, build_toc_layer(root))
 def test_build_toc_layer_section(self):
     """A section carrying its own TOC is reported under the section label.

     The tableOfContents element has its own ``label`` attribute in the
     fixture, but the expected key is the enclosing section's label.
     """
     section_xml = """
     <section xmlns="eregs" label="1234-1" sectionNum="1">
       <subject>§ 1234.1</subject>
       <tableOfContents label="1234-Subpart-A-TOC">
         <tocSecEntry target="1234-1-a">
           <sectionNum>1</sectionNum>
           <sectionSubject>§ 1234.1(a)</sectionSubject>
         </tocSecEntry>
       </tableOfContents>
       <paragraph label="1234-1-a" marker="a">
         <content>This is a section with its own TOC</content>
       </paragraph>
     </section>
     """
     root = etree.fromstring(section_xml)

     toc_entry = {
         u'index': [u'1234', u'1', u'a'],
         u'title': u'\xa7 1234.1(a)',
     }
     expected = {'1234-1': [toc_entry]}

     actual = build_toc_layer(root)
     self.assertEqual(expected, actual)
 def test_build_toc_layer_part(self):
     """A part-level TOC mixing section and appendix entries keeps both."""
     part_xml = """
     <part xmlns="eregs" label="1234">
       <tableOfContents>
         <tocSecEntry target="1234-1">
           <sectionNum>1</sectionNum>
           <sectionSubject>§ 1234.1</sectionSubject>
         </tocSecEntry>
         <tocAppEntry target="1234-A">
           <appendixLetter>A</appendixLetter>
           <appendixSubject>Appendix</appendixSubject>
         </tocAppEntry>
       </tableOfContents>
       <content/>
     </part>
     """
     root = etree.fromstring(part_xml)

     # Entries are expected in document order: the section, then the appendix.
     section_entry = {'index': [u'1234', u'1'], 'title': u'\xa7 1234.1'}
     appendix_entry = {'index': [u'1234', u'A'], 'title': 'Appendix'}
     expected = {'1234': [section_entry, appendix_entry]}

     self.assertEqual(expected, build_toc_layer(root))
Exemple #6
0
def generate_json(regulation_file, check_terms=False):
    """Parse a regulation XML file, build its layers and emit them.

    The file is located through ``find_file``, parsed with lxml, and a
    validator is obtained from ``get_validator``. The regulation tree, every
    layer and the notice are then handed to ``write_layer`` one by one.

    :param regulation_file: regulation XML file to process. Its base name
        with ``.xml`` removed is the version every output is filed under.
    :param check_terms: when true, also run
        ``validator.validate_term_references``.
    :returns: the tuple ``(reg_number, notice, xml_tree)``.
    """
    # Read raw bytes rather than text: lxml rejects unicode input that
    # carries an XML encoding declaration, and with bytes the parser decodes
    # according to the document instead of the platform's default encoding.
    with open(find_file(regulation_file), 'rb') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)

    # Validate the file relative to schema (presumably done inside
    # get_validator -- only the returned validator is used here).
    validator = get_validator(xml_tree)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]

    # Layers are built in a fixed order, all from the same parsed tree.
    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the content checks and report every validator event. Reporting is
    # informational only: processing continues whatever the events say.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # find() returns None when the element is absent. Treat that as "no
    # notice" so the mismatch branch below falls back to the file-derived
    # version instead of failing with AttributeError.
    doc_number = xml_tree.find('.//{eregs}documentNumber')
    notice = doc_number.text if doc_number is not None else None
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    # (payload, destination) pairs, emitted in this exact order.
    outputs = (
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    )
    for payload, destination in outputs:
        write_layer(payload, reg_number, notice, destination)

    return reg_number, notice, xml_tree
def parser_driver(regulation_file,
                  check_terms=False,
                  correct_interps=False,
                  headerize_interps=False,
                  fix_missed_cites=False):
    """Validate a regulation XML file, build its layers and emit them.

    The file is parsed with lxml and checked against ``settings.XSD_FILE``
    through ``EregsValidator``. If the validator reports the document as
    invalid, its events are printed and the process exits. Otherwise the
    regulation tree, every layer and the notice are handed to
    ``write_layer`` one by one.

    :param regulation_file: path of the regulation XML file. Its base name
        with ``.xml`` removed is the version every output is filed under.
    :param check_terms: when true, also run
        ``validator.validate_term_references``.
    :param correct_interps: when true, call
        ``validator.insert_interp_markers`` before the layers are built.
    :param headerize_interps: when true, call
        ``validator.headerize_interps`` before the layers are built.
    :param fix_missed_cites: when true, call
        ``validator.fix_omitted_cites`` before the layers are built.
    """
    # Read raw bytes rather than text: lxml rejects unicode input that
    # carries an XML encoding declaration, and with bytes the parser decodes
    # according to the document instead of the platform's default encoding.
    with open(regulation_file, 'rb') as f:
        reg_xml = f.read()
    xml_tree = etree.fromstring(reg_xml)

    # validate relative to schema
    validator = EregsValidator(settings.XSD_FILE)
    validator.validate_reg(xml_tree)

    if not validator.is_valid:
        for event in validator.events:
            print(str(event))
        # NOTE(review): exit status 0 signals success to the calling shell
        # even though validation failed; kept as-is because callers may
        # depend on it -- confirm whether a non-zero status is wanted.
        sys.exit(0)

    reg_tree = build_reg_tree(xml_tree)
    reg_number = reg_tree.label[0]
    # we can correct interps right away if necessary
    if correct_interps:
        validator.insert_interp_markers(xml_tree, regulation_file)
    if headerize_interps:
        validator.headerize_interps(xml_tree, regulation_file)
    if fix_missed_cites:
        validator.fix_omitted_cites(xml_tree, regulation_file)

    # Layers are built in a fixed order, all from the same parsed tree.
    paragraph_markers = build_paragraph_marker_layer(xml_tree)
    internal_citations = build_internal_citations_layer(xml_tree)
    external_citations = build_external_citations_layer(xml_tree)
    terms = build_terms_layer(xml_tree)
    meta = build_meta_layer(xml_tree)
    toc = build_toc_layer(xml_tree)
    keyterms = build_keyterm_layer(xml_tree)
    graphics = build_graphics_layer(xml_tree)
    formatting = build_formatting_layer(xml_tree)
    interps = build_interp_layer(xml_tree)
    analysis = build_analysis(xml_tree)
    notice_dict = build_notice(xml_tree)

    # Run the content checks and report every validator event. Unlike the
    # schema check above, these are informational: processing continues.
    validator.validate_terms(xml_tree, terms)
    validator.validate_internal_cites(xml_tree, internal_citations)
    if check_terms:
        validator.validate_term_references(xml_tree, terms, regulation_file)
    for event in validator.events:
        print(str(event))

    reg_tree.include_children = True
    reg_json = reg_tree.to_json()

    # find() returns None when the element is absent. Treat that as "no
    # notice" so the mismatch branch below falls back to the file-derived
    # version instead of failing with AttributeError.
    doc_number = xml_tree.find('.//{eregs}documentNumber')
    notice = doc_number.text if doc_number is not None else None
    version = os.path.split(regulation_file)[-1].replace('.xml', '')
    if notice != version:
        print('Notice ({}) different from version ({}), '
              'using version'.format(notice, version))
        notice = version

    # (payload, destination) pairs, emitted in this exact order.
    outputs = (
        (reg_json, 'regulation'),
        (meta, 'layer/meta'),
        (paragraph_markers, 'layer/paragraph-markers'),
        (internal_citations, 'layer/internal-citations'),
        (external_citations, 'layer/external-citations'),
        (terms, 'layer/terms'),
        (toc, 'layer/toc'),
        (keyterms, 'layer/keyterms'),
        (graphics, 'layer/graphics'),
        (formatting, 'layer/formatting'),
        (interps, 'layer/interpretations'),
        (analysis, 'layer/analyses'),
        (notice_dict, 'notice'),
    )
    for payload, destination in outputs:
        write_layer(payload, reg_number, notice, destination)