def test_build_section_by_section_repeat_label(self):
    """A header repeating an earlier label nests as an unlabeled child.

    Each scenario feeds two headers (each followed by one paragraph) to
    ``sxs.build_section_by_section`` and asserts that exactly one
    top-level structure is returned with the expected labels, and that
    the second header's paragraph ends up in a single child which has no
    ``labels`` key.
    """

    def check(markup, expected_labels):
        # Shared assertions: one labeled parent holding one unlabeled child.
        nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
        structures = sxs.build_section_by_section(nodes, 23, '876')
        self.assertEqual(len(structures), 1)
        parent = structures[0]
        self.assertEqual(parent['labels'], expected_labels)
        self.assertEqual(['Content 1'], parent['paragraphs'])
        self.assertEqual(len(parent['children']), 1)
        child = parent['children'][0]
        self.assertEqual(['Content 2'], child['paragraphs'])
        # assertNotIn shows the offending dict when it fails, unlike
        # assertFalse on a pre-evaluated boolean.
        self.assertNotIn('labels', child)

    # Same citation repeated on a deeper header level (H2 then H3)
    check(
        ' <ROOT> '
        '<HD SOURCE="H2">This references 23(c)</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H3">SO DOES THIS! 23(c) continued</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        ['876-23-c'])

    # Now the same, but on the same H level
    check(
        ' <ROOT> '
        '<HD SOURCE="H2">This references 23(c)</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H2">SO DOES THIS! 23(c) continued</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        ['876-23-c'])

    # Semi-repeated: the second header names only one of the two appendices
    check(
        ' <ROOT> '
        '<HD SOURCE="H2">Appendices A and B</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H2">Appendix B</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        ['876-A', '876-B'])
def test_build_section_by_section_backtrack(self):
    """A later header citing a broader/earlier label nests unlabeled.

    Each scenario asserts that ``sxs.build_section_by_section`` returns a
    single top-level structure labeled from the first header, and that
    the second header's paragraph lands in a single child which has no
    ``labels`` key.
    """

    def check(markup, page, part, expected_labels, parent_paragraphs):
        # Shared assertions: one labeled parent holding one unlabeled child.
        nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
        structures = sxs.build_section_by_section(nodes, page, part)
        self.assertEqual(len(structures), 1)
        parent = structures[0]
        self.assertEqual(parent['labels'], expected_labels)
        self.assertEqual(parent_paragraphs, parent['paragraphs'])
        self.assertEqual(len(parent['children']), 1)
        child = parent['children'][0]
        self.assertEqual(['Content 2'], child['paragraphs'])
        # assertNotIn shows the offending dict when it fails, unlike
        # assertFalse on a pre-evaluated boolean.
        self.assertNotIn('labels', child)

    # Second header cites the parent paragraph, same H level
    check(
        ' <ROOT> '
        '<HD SOURCE="H2">This references 23(c)(3)</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H2">Off handed comment about 23(c)</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        23, '876', ['876-23-c-3'], ['Content 1'])

    # Same, but deeper H level
    check(
        ' <ROOT> '
        '<HD SOURCE="H2">This references 23(c)(3)</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H3">Off handed comment about 23(c)</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        23, '876', ['876-23-c-3'], ['Content 1'])

    # No part then part: the first header has no paragraphs of its own
    check(
        ' <ROOT> '
        '<HD SOURCE="H3">This references 23(c)</HD> '
        '<HD SOURCE="H3">Off handed comment about section 1111.23</HD> '
        '<P>Content 2</P> '
        '</ROOT>',
        22, '1111', ['1111-23-c'], [])
def process_xml(notice, notice_xml):
    """Copy selected pieces of ``notice_xml`` onto ``notice`` and return it.

    Sets ``contact`` (text of the first ``//FURINF/P`` node) and
    ``addresses`` only when something was found, always sets
    ``section_by_section``, and sets ``amendments`` only when at least
    one amendment was parsed out of the ``//AMDPAR`` nodes.
    """
    contact_nodes = notice_xml.xpath('//FURINF/P')
    if contact_nodes:
        notice['contact'] = contact_nodes[0].text

    found_addresses = fetch_addresses(notice_xml)
    if found_addresses:
        notice['addresses'] = found_addresses

    # NOTE(review): the tests and the other process_sxs variant visible in
    # this file pass the start page *before* the part label; here the
    # order is part, then page. Confirm against the signature of
    # build_section_by_section.
    raw_sxs = find_section_by_section(notice_xml)
    notice['section_by_section'] = build_section_by_section(
        raw_sxs, notice['cfr_part'], notice['meta']['start_page'])

    # parse_amdpar hands back an updated context which is fed into the
    # next AMDPAR, so this has to stay a sequential loop.
    running_context = []
    collected = []
    for amdpar in notice_xml.xpath('//AMDPAR'):
        parsed, running_context = parse_amdpar(amdpar, running_context)
        collected.extend(parsed)
    if collected:
        notice['amendments'] = collected

    return notice
def test_build_section_by_section_dup_child(self):
    """A sub-header citing 31(b)(1) under a "31(a) and (b)" header.

    Asserts one top-level structure labeled for both 31(a) and 31(b)
    with two children: the plain "Subcontent" header, and a child
    labeled ``876-31-b-1``.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="H2">References 31(a) and (b)</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="H3">Subcontent</HD> '
        '<P>Content 2</P> '
        '<HD SOURCE="H3">References 31(b)(1)</HD> '
        '<P>Content 3</P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 23, '876')
    self.assertEqual(len(structures), 1)

    parent = structures[0]
    self.assertEqual(parent['labels'], ['876-31-a', '876-31-b'])
    self.assertEqual(['Content 1'], parent['paragraphs'])
    self.assertEqual(len(parent['children']), 2)

    subcontent, labeled_child = parent['children']
    # First child: the unlabeled "Subcontent" header
    self.assertEqual(subcontent['title'], 'Subcontent')
    self.assertEqual(['Content 2'], subcontent['paragraphs'])
    self.assertEqual(len(subcontent['children']), 0)
    # Second child: the more specific 31(b)(1) reference
    self.assertEqual(labeled_child['labels'], ['876-31-b-1'])
    self.assertEqual(['Content 3'], labeled_child['paragraphs'])
    self.assertEqual(len(labeled_child['children']), 0)
def test_build_section_by_section_same_level(self):
    """Check labels and nesting when two sub-headers share a level.

    The expected result nests the unlabeled "Subheader, Really" entry
    beneath the labeled "3(q)(4) More Info" entry, which in turn sits
    beneath the section header.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="HD2">Section 99.3 Something Here</HD> '
        '<HD SOURCE="HD3">3(q)(4) More Info</HD> '
        '<P>Content 1</P> '
        '<HD SOURCE="HD3">Subheader, Really</HD> '
        '<P>Content 2</P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 765, '99')
    self.assertEqual(1, len(structures))

    # Expected tree, assembled from the innermost node outwards
    subheader = {
        'title': 'Subheader, Really',
        'paragraphs': ['Content 2'],
        'footnote_refs': [],
        'children': [],
        'page': 765,
    }
    more_info = {
        'title': '3(q)(4) More Info',
        'labels': ['99-3-q-4'],
        'paragraphs': ['Content 1'],
        'page': 765,
        'footnote_refs': [],
        'children': [subheader],
    }
    expected = {
        'title': 'Section 99.3 Something Here',
        'labels': ['99-3'],
        'paragraphs': [],
        'page': 765,
        'footnote_refs': [],
        'children': [more_info],
    }
    self.assertEqual(structures[0], expected)
def test_build_section_by_section_extra_tags(self):
    """Check paragraph text when extra tags appear inside <P> elements.

    The fixture embeds PRTPAGE, SU/FTREF and E tags; the expected
    structure lists the resulting paragraph strings and one footnote
    reference for the second paragraph.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="HD2">Section 99.3 Info</HD> '
        '<P>Content<PRTPAGE P="50249"/>1</P> '
        '<P>Content <SU>99</SU><FTREF />2</P> '
        '<P>Content <E T="03">Emph</E></P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 939, '99')
    self.assertEqual(1, len(structures))

    expected = {
        'title': 'Section 99.3 Info',
        'labels': ['99-3'],
        'page': 939,
        'paragraphs': [
            'Content 1',
            'Content 2',
            'Content <em data-original="E-03">Emph</em>',
        ],
        'footnote_refs': [
            {'paragraph': 1, 'reference': '99', 'offset': 8},
        ],
        'children': [],
    }
    self.assertEqual(structures[0], expected)
def process_sxs(notice, notice_xml):
    """Locate the SXS in ``notice_xml``, build it, and store it.

    The built structure is written to ``notice['section_by_section']``;
    nothing is returned.
    """
    raw_sxs = find_section_by_section(notice_xml)
    start_page = notice['meta']['start_page']
    # cfr_parts[0] stays the default SxS label until a counter example
    # turns up.
    default_label = notice['cfr_parts'][0]
    notice['section_by_section'] = build_section_by_section(
        raw_sxs, start_page, default_label)
def test_build_section_by_section_multiple(self):
    """One header naming several comments yields several labels.

    Asserts a single structure whose ``labels`` list has one entry per
    comment cited in the header.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="H2">Comments 22(a)-5, 22(a)-6, and 22(b)</HD> '
        '<P>Content</P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 23, '876')
    self.assertEqual(len(structures), 1)

    expected_labels = [
        '876-22-a-Interp-5',
        '876-22-a-Interp-6',
        '876-22-b-Interp',
    ]
    self.assertEqual(structures[0]['labels'], expected_labels)
def test_build_section_by_section(self):
    """Basic nesting: two top-level headers, the first with two children.

    The first expected structure has no ``labels`` key; the second,
    titled "4(b) Header", is expected to be labeled ``100-4-b`` and to
    collect both the <P> and the <FP> content as paragraphs.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="HD3">Section Header</HD> '
        '<P>Content 1</P> '
        '<P>Content 2</P> '
        '<HD SOURCE="HD4">Sub Section Header</HD> '
        '<P>Content 3</P> '
        '<HD SOURCE="HD4">Another</HD> '
        '<P>Content 4</P> '
        '<HD SOURCE="HD3">4(b) Header</HD> '
        '<P>Content 5</P> '
        '<FP>Content 6</FP> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 83, '100')
    self.assertEqual(2, len(structures))

    sub_section = {
        'title': 'Sub Section Header',
        'paragraphs': ['Content 3'],
        'children': [],
        'footnote_refs': [],
        'page': 83,
    }
    another = {
        'title': 'Another',
        'paragraphs': ['Content 4'],
        'children': [],
        'footnote_refs': [],
        'page': 83,
    }
    expected_first = {
        'title': 'Section Header',
        'paragraphs': ['Content 1', 'Content 2'],
        'footnote_refs': [],
        'children': [sub_section, another],
        'page': 83,
    }
    self.assertEqual(structures[0], expected_first)

    expected_second = {
        'title': '4(b) Header',
        'paragraphs': ['Content 5', 'Content 6'],
        'labels': ['100-4-b'],
        'page': 83,
        'footnote_refs': [],
        'children': [],
    }
    self.assertEqual(structures[1], expected_second)
def test_build_section_by_section_emphasis(self):
    """<E T="03"> tags come out as <em> markup with spacing around them.

    The first two fixture paragraphs have no whitespace around the E
    tag while the expected strings do; the third already has spaces and
    is expected to keep single spaces.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="H2">Section 876.23 Title Here</HD> '
        '<P>This sentence has<E T="03">emphasis</E>!</P> '
        '<P>Non emph,<E T="03">emph</E>then more.</P> '
        '<P>This one has an <E T="03">emph</E> with spaces.</P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 23, '876')

    expected_paragraphs = [
        'This sentence has <em data-original="E-03">emphasis</em>!',
        'Non emph, <em data-original="E-03">emph</em> then more.',
        'This one has an <em data-original="E-03">emph</em> with spaces.',
    ]
    self.assertEqual(structures[0]['paragraphs'], expected_paragraphs)
def test_build_section_by_section_footnotes_full(self):
    """A footnote reference following emphasized text is recorded.

    Asserts that the reference marker is absent from the paragraph
    text, that the FTNT block adds no paragraph, and that the recorded
    footnote offset equals the length of the first paragraph's text.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="H2">Section 876.23 Title Here</HD> '
        '<P>Sometimes<E T="03">citations</E><SU>5</SU><FTREF /></P> '
        '<P>Are rather complicated</P> '
        '<FTNT><P><SU>5</SU>Footnote contents</P></FTNT> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 23, '876')
    section = structures[0]

    first_paragraph = 'Sometimes <em data-original="E-03">citations</em>'
    self.assertEqual(
        section['paragraphs'],
        [first_paragraph, 'Are rather complicated'])

    expected_refs = [{
        'paragraph': 0,
        'reference': '5',
        'offset': len(first_paragraph),
    }]
    self.assertEqual(section['footnote_refs'], expected_refs)
def test_build_section_by_section_footnotes(self):
    """Only paragraph tags are accounted for right now.

    The FTNT element sitting between the two <P> tags is expected to
    contribute neither a paragraph nor a footnote reference.
    """
    markup = (
        ' <ROOT> '
        '<HD SOURCE="HD3">Section Header</HD> '
        '<P>Content 1</P> '
        '<FTNT>Content A</FTNT> '
        '<P>Content 2</P> '
        '</ROOT>'
    )
    nodes = list(etree.fromstring(markup).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(nodes, 21, '100')
    self.assertEqual(1, len(structures))

    expected = {
        'title': 'Section Header',
        'paragraphs': ['Content 1', 'Content 2'],
        'children': [],
        'footnote_refs': [],
        'page': 21,
    }
    self.assertEqual(structures[0], expected)
def test_build_section_by_section_extra_tags(self):
    """Check the built structure when <P> elements contain other tags.

    Covers a PRTPAGE tag splitting a paragraph, an SU/FTREF footnote
    marker, and an E (emphasis) tag.
    """
    pieces = [
        ' <ROOT>',
        '<HD SOURCE="HD2">Section 99.3 Info</HD>',
        '<P>Content<PRTPAGE P="50249"/>1</P>',
        '<P>Content <SU>99</SU><FTREF />2</P>',
        '<P>Content <E T="03">Emph</E></P>',
        '</ROOT>',
    ]
    xml = ' '.join(pieces)
    elements = list(etree.fromstring(xml).xpath("/ROOT/*"))
    structures = sxs.build_section_by_section(elements, 939, '99')
    self.assertEqual(1, len(structures))

    footnote = dict(paragraph=1, reference='99', offset=8)
    expected = dict(
        title='Section 99.3 Info',
        labels=['99-3'],
        page=939,
        paragraphs=[
            'Content 1',
            'Content 2',
            'Content <em data-original="E-03">Emph</em>',
        ],
        footnote_refs=[footnote],
        children=[],
    )
    self.assertEqual(structures[0], expected)
def process_sxs(notice, notice_xml):
    """Find the SXS in ``notice_xml``, build it, and attach it to ``notice``.

    The result is stored under ``notice['section_by_section']``; nothing
    is returned.
    """
    found = find_section_by_section(notice_xml)
    # NOTE(review): the tests and the other process_sxs variant visible in
    # this file pass the start page *before* the part label; here the
    # order is part, then page. Confirm against the signature of
    # build_section_by_section.
    notice['section_by_section'] = build_section_by_section(
        found, notice['cfr_part'], notice['meta']['start_page'])