コード例 #1
0
    def test_build_section_by_section_repeat_label(self):
        """A header that repeats (fully or partly) the citation of the
        header before it is expected to nest as an unlabeled child of
        that section rather than to start a second labeled section."""

        def check_nested(markup, expected_labels):
            # Shared expectations for every scenario below: exactly one
            # labeled parent holding 'Content 1', with a single child
            # holding 'Content 2' and carrying no 'labels' key.
            elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
            built = sxs.build_section_by_section(elements, 23, '876')
            self.assertEqual(len(built), 1)
            parent = built[0]
            self.assertEqual(parent['labels'], expected_labels)
            self.assertEqual(['Content 1'], parent['paragraphs'])
            self.assertEqual(len(parent['children']), 1)
            child = parent['children'][0]
            self.assertEqual(['Content 2'], child['paragraphs'])
            self.assertNotIn('labels', child)

        # Repeated citation sits one header level deeper (H2 then H3)
        check_nested("""
        <ROOT>
            <HD SOURCE="H2">This references 23(c)</HD>
            <P>Content 1</P>
            <HD SOURCE="H3">SO DOES THIS! 23(c) continued</HD>
            <P>Content 2</P>
        </ROOT>""", ['876-23-c'])

        # Now the same, but on the same H level
        check_nested("""
        <ROOT>
            <HD SOURCE="H2">This references 23(c)</HD>
            <P>Content 1</P>
            <HD SOURCE="H2">SO DOES THIS! 23(c) continued</HD>
            <P>Content 2</P>
        </ROOT>""", ['876-23-c'])

        # Semi-repeated: the second header names only one of the two
        # appendices named by the first
        check_nested("""
        <ROOT>
            <HD SOURCE="H2">Appendices A and B</HD>
            <P>Content 1</P>
            <HD SOURCE="H2">Appendix B</HD>
            <P>Content 2</P>
        </ROOT>""", ['876-A', '876-B'])
コード例 #2
0
    def test_build_section_by_section_backtrack(self):
        """When a header cites 23(c)(3) (or 23(c)) and the following
        header only mentions the enclosing 23(c) (or section 1111.23),
        the second header is expected to become an unlabeled child of
        the first instead of a new labeled section."""
        same_level = """
        <ROOT>
            <HD SOURCE="H2">This references 23(c)(3)</HD>
            <P>Content 1</P>
            <HD SOURCE="H2">Off handed comment about 23(c)</HD>
            <P>Content 2</P>
        </ROOT>"""

        # Same, but deeper H level
        deeper_level = """
        <ROOT>
            <HD SOURCE="H2">This references 23(c)(3)</HD>
            <P>Content 1</P>
            <HD SOURCE="H3">Off handed comment about 23(c)</HD>
            <P>Content 2</P>
        </ROOT>"""

        # No part then part
        no_part_then_part = """
        <ROOT>
            <HD SOURCE="H3">This references 23(c)</HD>
            <HD SOURCE="H3">Off handed comment about section 1111.23</HD>
            <P>Content 2</P>
        </ROOT>"""

        # (markup, trailing build arguments, parent labels, parent paragraphs)
        scenarios = [
            (same_level, (23, '876'), ['876-23-c-3'], ['Content 1']),
            (deeper_level, (23, '876'), ['876-23-c-3'], ['Content 1']),
            (no_part_then_part, (22, '1111'), ['1111-23-c'], []),
        ]
        for markup, build_args, labels, parent_paragraphs in scenarios:
            elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
            built = sxs.build_section_by_section(elements, *build_args)
            self.assertEqual(len(built), 1)
            parent = built[0]
            self.assertEqual(parent['labels'], labels)
            self.assertEqual(parent_paragraphs, parent['paragraphs'])
            self.assertEqual(len(parent['children']), 1)
            child = parent['children'][0]
            self.assertEqual(['Content 2'], child['paragraphs'])
            self.assertNotIn('labels', child)
コード例 #3
0
def process_xml(notice, notice_xml):
    """Copy the fields of interest from ``notice_xml`` into ``notice``.

    Keys written on ``notice``:

    * ``contact`` -- text of the first ``//FURINF/P`` element, only when
      such an element exists.
    * ``addresses`` -- result of ``fetch_addresses``, only when truthy.
    * ``section_by_section`` -- always set, built from the notice's
      ``cfr_part`` and ``meta.start_page``.
    * ``amendments`` -- everything parsed from the ``//AMDPAR``
      elements, only when at least one amendment was produced.

    Returns the same ``notice`` object that was passed in.
    """
    contact_nodes = notice_xml.xpath('//FURINF/P')
    if contact_nodes:
        notice['contact'] = contact_nodes[0].text

    found_addresses = fetch_addresses(notice_xml)
    if found_addresses:
        notice['addresses'] = found_addresses

    raw_sxs = find_section_by_section(notice_xml)
    notice['section_by_section'] = build_section_by_section(
        raw_sxs, notice['cfr_part'], notice['meta']['start_page'])

    # Each parse_amdpar call hands back an updated context, which is fed
    # into the call for the next AMDPAR element.
    collected, running_context = [], []
    for amdpar in notice_xml.xpath('//AMDPAR'):
        parsed, running_context = parse_amdpar(amdpar, running_context)
        collected.extend(parsed)
    if collected:
        notice['amendments'] = collected

    return notice
コード例 #4
0
    def test_build_section_by_section_dup_child(self):
        """A parent labeled 31(a) and 31(b) is expected to keep both an
        unlabeled sub-header and a later header citing 31(b)(1) as two
        separate children, the latter with its own label."""
        markup = """
        <ROOT>
            <HD SOURCE="H2">References 31(a) and (b)</HD>
            <P>Content 1</P>
            <HD SOURCE="H3">Subcontent</HD>
            <P>Content 2</P>
            <HD SOURCE="H3">References 31(b)(1)</HD>
            <P>Content 3</P>
        </ROOT>"""
        elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
        built = sxs.build_section_by_section(elements, 23, '876')

        # Top level: a single section carrying both labels
        self.assertEqual(len(built), 1)
        parent = built[0]
        self.assertEqual(parent['labels'], ['876-31-a', '876-31-b'])
        self.assertEqual(['Content 1'], parent['paragraphs'])
        self.assertEqual(len(parent['children']), 2)
        plain_child, labeled_child = parent['children']

        # First child: the plain "Subcontent" header
        self.assertEqual(plain_child['title'], 'Subcontent')
        self.assertEqual(['Content 2'], plain_child['paragraphs'])
        self.assertEqual(len(plain_child['children']), 0)

        # Second child: the more specific 31(b)(1) citation
        self.assertEqual(labeled_child['labels'], ['876-31-b-1'])
        self.assertEqual(['Content 3'], labeled_child['paragraphs'])
        self.assertEqual(len(labeled_child['children']), 0)
コード例 #5
0
 def test_build_section_by_section_same_level(self):
     """Two HD3 headers under a labeled HD2: the one citing 3(q)(4) is
     expected to get its own label, and the uncited one that follows at
     the same level is expected to nest beneath it without labels."""
     markup = """
     <ROOT>
         <HD SOURCE="HD2">Section 99.3 Something Here</HD>
         <HD SOURCE="HD3">3(q)(4) More Info</HD>
         <P>Content 1</P>
         <HD SOURCE="HD3">Subheader, Really</HD>
         <P>Content 2</P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 765, '99')
     self.assertEqual(1, len(built))

     # Expected tree, assembled from the innermost node outwards
     subheader = {
         'title': 'Subheader, Really',
         'paragraphs': ['Content 2'],
         'footnote_refs': [],
         'children': [],
         'page': 765,
     }
     more_info = {
         'title': '3(q)(4) More Info',
         'labels': ['99-3-q-4'],
         'paragraphs': ['Content 1'],
         'page': 765,
         'footnote_refs': [],
         'children': [subheader],
     }
     section = {
         'title': 'Section 99.3 Something Here',
         'labels': ['99-3'],
         'paragraphs': [],
         'page': 765,
         'footnote_refs': [],
         'children': [more_info],
     }
     self.assertEqual(built[0], section)
コード例 #6
0
 def test_build_section_by_section_extra_tags(self):
     """Paragraphs containing inline PRTPAGE, SU/FTREF and E tags: the
     test pins the resulting paragraph text, the recorded footnote
     reference and the <em> rendering of the emphasis tag."""
     markup = """
     <ROOT>
         <HD SOURCE="HD2">Section 99.3 Info</HD>
         <P>Content<PRTPAGE P="50249"/>1</P>
         <P>Content <SU>99</SU><FTREF />2</P>
         <P>Content <E T="03">Emph</E></P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 939, '99')
     self.assertEqual(1, len(built))

     expected_paragraphs = [
         'Content 1',
         'Content  2',
         'Content <em data-original="E-03">Emph</em>',
     ]
     # The footnote marker lives in the second paragraph (index 1)
     expected_footnotes = [
         {'paragraph': 1, 'reference': '99', 'offset': 8},
     ]
     self.assertEqual(built[0], {
         'title': 'Section 99.3 Info',
         'labels': ['99-3'],
         'page': 939,
         'paragraphs': expected_paragraphs,
         'footnote_refs': expected_footnotes,
         'children': [],
     })
コード例 #7
0
ファイル: build.py プロジェクト: eregs/regulations-parser
def process_sxs(notice, notice_xml):
    """Locate the section-by-section analysis in ``notice_xml``, build
    its structure, and store it under ``notice['section_by_section']``.
    """
    raw_sxs = find_section_by_section(notice_xml)
    start_page = notice['meta']['start_page']
    # cfr_parts[0] stays the default SxS label until a counter example
    # turns up.
    default_part = notice['cfr_parts'][0]
    notice['section_by_section'] = build_section_by_section(
        raw_sxs, start_page, default_part)
コード例 #8
0
def process_sxs(notice, notice_xml):
    """Build the section-by-section structure for a notice.

    The raw SxS elements are found in ``notice_xml`` and the built
    result is written to ``notice['section_by_section']``; nothing is
    returned.
    """
    found = find_section_by_section(notice_xml)
    # NOTE: the first entry of cfr_parts is used as the default SxS
    # label until a counter example is found.
    built = build_section_by_section(
        found,
        notice['meta']['start_page'],
        notice['cfr_parts'][0],
    )
    notice['section_by_section'] = built
コード例 #9
0
 def test_build_section_by_section_multiple(self):
     """A single header listing several comment citations is expected
     to yield one structure carrying one Interp label per citation."""
     markup = """
     <ROOT>
         <HD SOURCE="H2">Comments 22(a)-5, 22(a)-6, and 22(b)</HD>
         <P>Content</P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 23, '876')
     self.assertEqual(len(built), 1)

     expected_labels = [
         '876-22-a-Interp-5',
         '876-22-a-Interp-6',
         '876-22-b-Interp',
     ]
     self.assertEqual(built[0]['labels'], expected_labels)
コード例 #10
0
 def test_build_section_by_section(self):
     """Basic tree building: HD4 headers are expected to nest under the
     preceding HD3, a second HD3 to start a new top-level structure,
     and both P and FP elements to contribute paragraphs."""

     def leaf(title, paragraph):
         # Expected shape of a childless, unlabeled sub-section
         return {
             'title': title,
             'paragraphs': [paragraph],
             'children': [],
             'footnote_refs': [],
             'page': 83,
         }

     markup = """
     <ROOT>
         <HD SOURCE="HD3">Section Header</HD>
         <P>Content 1</P>
         <P>Content 2</P>
         <HD SOURCE="HD4">Sub Section Header</HD>
         <P>Content 3</P>
         <HD SOURCE="HD4">Another</HD>
         <P>Content 4</P>
         <HD SOURCE="HD3">4(b) Header</HD>
         <P>Content 5</P>
         <FP>Content 6</FP>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 83, '100')
     self.assertEqual(2, len(built))

     # First top-level structure: no citation in its title, so the
     # expected dict has no 'labels' key
     self.assertEqual(built[0], {
         'title': 'Section Header',
         'paragraphs': ['Content 1', 'Content 2'],
         'footnote_refs': [],
         'children': [
             leaf('Sub Section Header', 'Content 3'),
             leaf('Another', 'Content 4'),
         ],
         'page': 83,
     })

     # Second top-level structure: labeled from the "4(b)" citation
     self.assertEqual(built[1], {
         'title': '4(b) Header',
         'paragraphs': ['Content 5', 'Content 6'],
         'labels': ['100-4-b'],
         'page': 83,
         'footnote_refs': [],
         'children': [],
     })
コード例 #11
0
 def test_build_section_by_section_emphasis(self):
     """E tags are expected to be rendered as <em> elements, with a
     space separating them from adjacent text where the source had
     none and without doubling spaces that were already there."""
     markup = """
     <ROOT>
         <HD SOURCE="H2">Section 876.23 Title Here</HD>
         <P>This sentence has<E T="03">emphasis</E>!</P>
         <P>Non emph,<E T="03">emph</E>then more.</P>
         <P>This one has an <E T="03">emph</E> with spaces.</P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 23, '876')

     expected = [
         'This sentence has <em data-original="E-03">emphasis</em>!',
         'Non emph, <em data-original="E-03">emph</em> then more.',
         'This one has an <em data-original="E-03">emph</em> with spaces.',
     ]
     self.assertEqual(built[0]['paragraphs'], expected)
コード例 #12
0
 def test_build_section_by_section_footnotes_full(self):
     """A footnote marker directly after an emphasised word: the marker
     is expected to be absent from the paragraph text and recorded as
     a footnote reference whose offset is the end of that text. The
     FTNT element is expected to add no paragraph."""
     markup = """
     <ROOT>
         <HD SOURCE="H2">Section 876.23 Title Here</HD>
         <P>Sometimes<E T="03">citations</E><SU>5</SU><FTREF /></P>
         <P>Are rather complicated</P>
         <FTNT><P><SU>5</SU>Footnote contents</P></FTNT>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 23, '876')
     section = built[0]

     first_paragraph = 'Sometimes <em data-original="E-03">citations</em>'
     self.assertEqual(section['paragraphs'],
                      [first_paragraph, 'Are rather complicated'])

     expected_ref = {
         'paragraph': 0,
         'reference': '5',
         'offset': len(first_paragraph),
     }
     self.assertEqual(section['footnote_refs'], [expected_ref])
コード例 #13
0
 def test_build_section_by_section_footnotes(self):
     """Only paragraph tags are accounted for right now: the FTNT
     element between the two paragraphs is expected to leave no trace
     in the built structure."""
     markup = """
     <ROOT>
         <HD SOURCE="HD3">Section Header</HD>
         <P>Content 1</P>
         <FTNT>Content A</FTNT>
         <P>Content 2</P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 21, '100')
     self.assertEqual(1, len(built))

     expected = {
         'title': 'Section Header',
         'paragraphs': ['Content 1', 'Content 2'],
         'children': [],
         'footnote_refs': [],
         'page': 21,
     }
     self.assertEqual(built[0], expected)
コード例 #14
0
 def test_build_section_by_section_extra_tags(self):
     """Inline PRTPAGE, SU/FTREF and E tags inside paragraphs: pins the
     paragraph text produced, the footnote reference recorded for the
     second paragraph, and the <em> rendering of the emphasis tag."""
     markup = """
     <ROOT>
         <HD SOURCE="HD2">Section 99.3 Info</HD>
         <P>Content<PRTPAGE P="50249"/>1</P>
         <P>Content <SU>99</SU><FTREF />2</P>
         <P>Content <E T="03">Emph</E></P>
     </ROOT>"""
     elements = list(etree.fromstring(markup).xpath("/ROOT/*"))
     built = sxs.build_section_by_section(elements, 939, '99')
     self.assertEqual(1, len(built))

     expected = {
         'title': 'Section 99.3 Info',
         'labels': ['99-3'],
         'page': 939,
         'paragraphs': [
             'Content 1',
             'Content 2',
             'Content <em data-original="E-03">Emph</em>',
         ],
         'footnote_refs': [
             {'paragraph': 1, 'reference': '99', 'offset': 8},
         ],
         'children': [],
     }
     self.assertEqual(built[0], expected)
コード例 #15
0
ファイル: build.py プロジェクト: jposi/regulations-parser
def process_sxs(notice, notice_xml):
    """Locate the section-by-section analysis in ``notice_xml``, build
    it using the notice's ``cfr_part`` and ``meta.start_page``, and
    store the result under ``notice['section_by_section']``.
    """
    raw_sxs = find_section_by_section(notice_xml)
    cfr_part = notice['cfr_part']
    start_page = notice['meta']['start_page']
    notice['section_by_section'] = build_section_by_section(
        raw_sxs, cfr_part, start_page)