def ecfr_notice(title, cfr_part, notice, applies_to, act_title,
                act_section, with_version=False, without_notice=False):
    """ Generate RegML for a single notice from eCFR XML.

    Reads the existing RegML file located via ``find_file`` under
    ``cfr_part``/``applies_to``, fetches the notice whose document number
    equals ``notice``, compiles a new regulation tree from the old tree plus
    that notice's changes, and writes the results through the builder.

    :param title: CFR title, handed to ``Builder`` as ``cfr_title``.
    :param cfr_part: CFR part; also the directory component joined with
        ``applies_to`` to locate the existing file.
    :param notice: document number of the notice to build.
    :param applies_to: file name of the version the notice applies to.
    :param act_title: act title passed to layer generation.
    :param act_section: act section passed to layer generation.
    :param with_version: when true, also write the regulation file for the
        new version.
    :param without_notice: when true, skip writing the notice file.
    :raises StopIteration: when none of the fetched notices carries the
        requested document number.
    """
    # Get the notice the new one applies to
    with open(find_file(os.path.join(cfr_part, applies_to)), 'r') as f:
        reg_xml = f.read()
    parser = etree.XMLParser(huge_tree=True)
    xml_tree = etree.fromstring(reg_xml, parser)
    doc_number = xml_tree.find('.//{eregs}documentNumber').text

    # Validate the file relative to schema.
    # NOTE(review): the returned validator was never used; the call is kept
    # in case get_validator itself performs/raises on validation -- confirm.
    get_validator(xml_tree)

    # Get the ecfr builder
    builder = Builder(cfr_title=title,
                      cfr_part=cfr_part,
                      doc_number=doc_number,
                      checkpointer=None,
                      writer_type='XML')

    # Fetch the notices from the FR API and find the notice we're
    # looking for
    builder.fetch_notices_json()
    print([n['document_number'] for n in builder.notices_json])
    notice_json = next((n for n in builder.notices_json
                        if n['document_number'] == notice))

    # Build the notice. A separate local name is used so the ``notice``
    # parameter (a document number) is not rebound to a different type.
    built_notice = builder.build_single_notice(notice_json)[0]

    if 'changes' not in built_notice:
        print('There are no changes in this notice to apply.')
        return

    # We've successfully fetched and parsed the new notice.
    # Build the reg tree and layers for the notice it applies to.
    old_tree = build_reg_tree(xml_tree)

    # Build the new reg tree from the old_tree + notice changes
    last_version = doc_number
    version = built_notice['document_number']
    merged_changes = builder.merge_changes(version, built_notice['changes'])
    reg_tree = compile_regulation(old_tree, merged_changes)
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree,
                                     [act_title, act_section],
                                     layer_cache)

    # Write the notice file
    if not without_notice:
        builder.write_notice(version,
                             old_tree=old_tree,
                             reg_tree=reg_tree,
                             layers=layers,
                             last_version=last_version)

    # Write the regulation file for the new notice. The compiled tree is
    # ``reg_tree``; the previously referenced ``new_tree`` never existed and
    # raised NameError whenever with_version was set.
    if with_version:
        builder.write_regulation(reg_tree, layers=layers)
def parse_regulation(args):
    """ Parse one regulation file as directed by the command-line arguments.

    Kept as a standalone function so that the run can be profiled. """
    with codecs.open(args.filename, 'r', 'utf-8') as reg_file:
        reg_text = reg_file.read()

    version = args.notice
    act_info = [args.act_title, args.act_section]

    # The regulation tree comes first; its label id supplies the CFR part
    tree = Builder.reg_tree(reg_text)

    builder = Builder(
        cfr_title=args.title,
        cfr_part=tree.label_id(),
        doc_number=version,
    )
    builder.write_notices()

    # The starting version of the regulation is always written
    logger.info("Version %s", version)
    builder.write_regulation(tree)

    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_info, cache)
    cache.replace_using(tree)

    # Diffs are optional; nothing more to do when they are switched off
    if not args.generate_diffs:
        return
    generate_diffs(version, tree, act_info, builder, cache)
def test_determine_doc_number_annual(self, fetch_notice_json):
    """An annual edition of the reg should yield a document number, looked
    up through the (mocked) notice fetch"""
    first_notice = {'el': 1, 'document_number': '111-111'}
    second_notice = {'el': 2, 'document_number': '222-222'}
    fetch_notice_json.return_value = [first_notice, second_notice]

    xml_str = """<?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="cfr.xsl"?> <CFRGRANULE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="CFRMergedXML.xsd"> <FDSYS> <CFRTITLE>12</CFRTITLE> <DATE>2013-01-01</DATE> <ORIGINALDATE>2012-01-01</ORIGINALDATE> </FDSYS> </CFRGRANULE>"""

    found = Builder.determine_doc_number(xml_str, '12', '34')
    self.assertEqual('111-111', found)

    # The fetch must have been made for this title/part, limited to final
    # notices effective no later than the edition's original date
    positional, keywords = fetch_notice_json.call_args
    self.assertEqual(('12', '34'), positional)
    expected_keywords = {
        'max_effective_date': '2012-01-01',
        'only_final': True,
    }
    self.assertEqual(expected_keywords, keywords)
def test_determine_doc_number_annual(self, fetch_notice_json):
    """For an annual edition, the document number should come from the
    _latest_ notice among those fetched (here: the latest effective date,
    then the later publication date)"""
    candidates = []
    for doc_number, effective_on, publication_date in (
            ('111-111', '2011-01-01', '2011-01-01'),
            ('222-222', '2011-10-20', '2011-02-02'),
            ('333-333', '2011-10-20', '2011-03-03'),
            ('444-444', '2011-04-04', '2011-04-04')):
        candidates.append({'document_number': doc_number,
                           'effective_on': effective_on,
                           'publication_date': publication_date})
    fetch_notice_json.return_value = candidates

    xml_str = """<?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="cfr.xsl"?> <CFRGRANULE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="CFRMergedXML.xsd"> <FDSYS> <CFRTITLE>12</CFRTITLE> <DATE>2013-01-01</DATE> <ORIGINALDATE>2012-01-01</ORIGINALDATE> </FDSYS> </CFRGRANULE>"""
    xml = etree.fromstring(xml_str)

    found = Builder.determine_doc_number(xml, '12', '34')
    self.assertEqual('333-333', found)

    # Check how the notices were requested
    positional, keywords = fetch_notice_json.call_args
    self.assertEqual(('12', '34'), positional)
    expected_keywords = {'max_effective_date': '2012-01-01',
                         'only_final': True}
    self.assertEqual(expected_keywords, keywords)
def test_changes_in_sequence_skips(self, init, merge_changes):
    """Only the notice following the builder's starting document number
    should come out of changes_in_sequence; earlier ones are skipped"""
    init.return_value = None
    b = Builder()   # __init__ is mocked, so no constructor arguments

    def make_notice(doc_number, effective_on, publication_date):
        # Every notice gets its own (empty) list of changes
        return {'document_number': doc_number,
                'effective_on': effective_on,
                'publication_date': publication_date,
                'changes': []}

    aaaa = make_notice('aaaa', '2012-12-12', '2011-11-11')
    bbbb = make_notice('bbbb', '2012-12-12', '2011-11-12')
    cccc = make_notice('cccc', '2013-01-01', '2012-01-01')

    b.eff_notices = {'2012-12-12': [aaaa, bbbb], '2013-01-01': [cccc]}
    # Start from the second notice of the first effective date
    b.doc_number = bbbb['document_number']

    changes = list(b.changes_in_sequence())
    self.assertEqual(len(changes), 1)

    expected_version = cccc['document_number']
    yielded_version = changes[0][0]
    self.assertEqual(expected_version, yielded_version)
    merged_version = merge_changes.call_args[0][0]
    self.assertEqual(expected_version, merged_version)
def test_determine_doc_number_fr(self):
    """The document number embedded in an FR notice should be extracted"""
    fr_notice_xml = """ <RULE> <FRDOC>[FR Doc. 2011-31715 Filed 12-21-11; 8:45 am]</FRDOC> <BILCOD>BILLING CODE 4810-AM-P</BILCOD> </RULE>"""
    # Title and part are placeholders in this test
    found = Builder.determine_doc_number(fr_notice_xml, '00', '00')
    self.assertEqual('2011-31715', found)
def test_determine_doc_number_fr(self):
    """An FR notice should give up the document number it contains"""
    expected = '2011-31715'
    notice_xml = """ <RULE> <FRDOC>[FR Doc. 2011-31715 Filed 12-21-11; 8:45 am]</FRDOC> <BILCOD>BILLING CODE 4810-AM-P</BILCOD> </RULE>"""
    # '00'/'00' stand in for the title and part arguments
    actual = Builder.determine_doc_number(notice_xml, '00', '00')
    self.assertEqual(expected, actual)
def test_revision_generator_notices(self, init, merge_changes):
    """revision_generator should yield one revision per notice after the
    starting document, each paired with the notices seen up to that point"""
    init.return_value = None
    b = Builder()   # __init__ is mocked, so no constructor arguments

    def make_notice(doc_number, effective_on, publication_date):
        # Every notice gets its own (empty) list of changes
        return {'document_number': doc_number,
                'effective_on': effective_on,
                'publication_date': publication_date,
                'changes': []}

    aaaa = make_notice('aaaa', '2012-12-12', '2011-11-11')
    bbbb = make_notice('bbbb', '2012-12-12', '2011-11-12')
    cccc = make_notice('cccc', '2013-01-01', '2012-01-01')

    b.notices = [aaaa, bbbb, cccc]
    b.eff_notices = {'2012-12-12': [aaaa, bbbb], '2013-01-01': [cccc]}
    b.doc_number = 'aaaa'
    tree = Node(label=['1111'])

    version_list = []
    notice_lists = []
    for revision in b.revision_generator(tree):
        # Each revision is a 4-tuple; only its first and last items matter
        current_notice, _, _, seen_notices = revision
        version_list.append(current_notice['document_number'])
        notice_lists.append(seen_notices)

    self.assertEqual(['bbbb', 'cccc'], version_list)
    self.assertEqual(2, len(notice_lists))

    after_bbbb, after_cccc = notice_lists

    self.assertEqual(2, len(after_bbbb))
    for expected in (aaaa, bbbb):
        self.assertIn(expected, after_bbbb)

    self.assertEqual(3, len(after_cccc))
    for expected in (aaaa, bbbb, cccc):
        self.assertIn(expected, after_cccc)
def parse_regulation(args):
    """ Drive the parser from the parsed command-line arguments.

    Lives in its own function so that the run can be profiled. """
    with codecs.open(args.filename, 'r', 'utf-8') as reg_file:
        reg_text = reg_file.read()
    digest = hashlib.sha256(reg_text.encode('utf-8')).hexdigest()
    act_info = [args.act_title, args.act_section]

    checkpointer = (Checkpointer(args.checkpoint) if args.checkpoint
                    else NullCheckpointer())

    def build_tree():
        return Builder.reg_tree(reg_text)

    # The regulation tree is obtained first, through the checkpointer and
    # keyed on the digest of the input file
    tree = checkpointer.checkpoint("init-tree-" + digest, build_tree)
    part = tree.label_id()

    def find_doc_number():
        return Builder.determine_doc_number(reg_text, args.title, part)

    version = checkpointer.checkpoint("doc-number-" + digest,
                                      find_doc_number)
    if not version:
        raise ValueError("Could not determine document number")

    suffix_pieces = ["", part, str(args.title), version]
    checkpointer.suffix = ":".join(suffix_pieces)

    # Hand everything to the builder
    builder = Builder(
        cfr_title=args.title,
        cfr_part=part,
        doc_number=version,
        checkpointer=checkpointer,
    )
    builder.write_notices()

    # The starting version of the regulation is always written
    logger.info("Version %s", version)
    builder.write_regulation(tree)

    cache = LayerCacheAggregator()
    builder.gen_and_write_layers(tree, act_info, cache)
    cache.replace_using(tree)

    # Diffs are optional; nothing more to do when they are switched off
    if not args.generate_diffs:
        return
    generate_diffs(version, tree, act_info, builder, cache, checkpointer)
def test_determine_doc_number_annual(self, fetch_notice_json):
    """A document number should be obtainable from an annual edition of the
    reg, using the (mocked) notice fetch"""
    fetched = [
        {'el': 1, 'document_number': '111-111'},
        {'el': 2, 'document_number': '222-222'},
    ]
    fetch_notice_json.return_value = fetched

    xml_str = """<?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="cfr.xsl"?> <CFRGRANULE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="CFRMergedXML.xsd"> <FDSYS> <CFRTITLE>12</CFRTITLE> <DATE>2013-01-01</DATE> <ORIGINALDATE>2012-01-01</ORIGINALDATE> </FDSYS> </CFRGRANULE>"""

    doc_number = Builder.determine_doc_number(xml_str, '12', '34')
    self.assertEqual('111-111', doc_number)

    # Inspect the single recorded call to the mocked fetch
    call_positional, call_keywords = fetch_notice_json.call_args
    self.assertEqual(('12', '34'), call_positional)
    self.assertEqual(
        {'max_effective_date': '2012-01-01', 'only_final': True},
        call_keywords)
# Parse the command line, then load the regulation source as UTF-8 text.
args = parser.parse_args()
with codecs.open(args.filename, 'r', 'utf-8') as f:
    reg = f.read()
# SHA-256 of the file contents; used below to key the checkpoints so they
# are tied to this exact input.
file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()
act_title_and_section = [args.act_title, args.act_section]
# Use a real checkpointer only when a checkpoint argument was supplied.
if args.checkpoint:
    checkpointer = Checkpointer(args.checkpoint)
else:
    checkpointer = NullCheckpointer()

# First, the regulation tree (computed through the checkpointer)
reg_tree = checkpointer.checkpoint(
    "init-tree-" + file_digest,
    lambda: Builder.reg_tree(reg))
title_part = reg_tree.label_id()
# The document number is likewise obtained through the checkpointer
doc_number = checkpointer.checkpoint(
    "doc-number-" + file_digest,
    lambda: Builder.determine_doc_number(reg, args.title, title_part))
# Nothing further can be built without a document number
if not doc_number:
    raise ValueError("Could not determine document number")
# Later checkpoints are suffixed with ":<part>:<title>:<doc_number>"
checkpointer.suffix = ":".join(
    ["", title_part, str(args.title), doc_number])

# Run Builder
builder = Builder(cfr_title=args.title,
                  cfr_part=title_part,
                  doc_number=doc_number,
                  checkpointer=checkpointer)
builder.write_notices()
# argparse applies ``type`` to the raw command-line string. ``type=bool``
# therefore turned every non-empty value -- including "False" -- into True,
# so diffs could not be switched off with a false-like word. The converter
# below treats the usual false-like spellings as False and, as before, any
# other non-empty value as True.
parser.add_argument(
    '--generate-diffs',
    type=lambda value: value.lower() not in (
        '', '0', 'false', 'f', 'no', 'n'),
    help='Generate diffs?', required=False, default=True)

args = parser.parse_args()
with codecs.open(args.filename, 'r', 'utf-8') as f:
    reg = f.read()

# The starting version is the notice given on the command line
doc_number = args.notice
act_title_and_section = [args.act_title, args.act_section]

# First, the regulation tree
reg_tree = Builder.reg_tree(reg)

builder = Builder(cfr_title=args.title,
                  cfr_part=reg_tree.label_id(),
                  doc_number=doc_number)
builder.write_notices()

# Always do at least the first reg
logger.info("Version %s", doc_number)
builder.write_regulation(reg_tree)
layer_cache = LayerCacheAggregator()
builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
layer_cache.replace_using(reg_tree)

# this used to assume implicitly that if gen-diffs was not specified it was
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None):
    """Build a regulation from ``filename`` plus an explicit list of notices.

    Reads the regulation source, builds its tree, builds each requested
    notice by document number, then writes the regulation and its layers.

    :param filename: path of the regulation source, read as UTF-8.
    :param title: CFR title, handed to ``Builder`` as ``cfr_title``.
    :param act_title: act title passed to layer generation.
    :param act_section: act section passed to layer generation.
    :param notice_doc_numbers: iterable of notice document numbers to build.
    :param doc_number: document number of the starting version; determined
        from the source via ``Builder.determine_doc_number`` when ``None``.
    :param checkpoint: argument for ``Checkpointer``; when falsy a
        ``NullCheckpointer`` is used instead.
    :raises ValueError: when ``doc_number`` was not given and could not be
        determined from the source.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
    # Checkpoints are keyed on the content of the input file
    file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint(
        "init-tree-" + file_digest,
        lambda: Builder.reg_tree(reg))

    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        # Fail with a clear message here, not with a TypeError from the
        # join below.
        if not doc_number:
            raise ValueError("Could not determine document number")

    # Use the ``title`` parameter: this function has no ``args`` of its own,
    # so ``args.title`` only worked if a module-level ``args`` happened to
    # exist.
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)

    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()

    act_title_and_section = [act_title, act_section]

    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    # NOTE(review): ``args`` is not a parameter of this function, so this
    # still depends on a module-level ``args`` (NameError otherwise). There
    # is no parameter carrying this flag; left unchanged to keep the
    # signature and current script behaviour intact.
    if args.generate_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
def build_by_notice(filename, title, act_title, act_section,
                    notice_doc_numbers, doc_number=None, checkpoint=None):
    """Build a regulation from ``filename`` plus an explicit list of notices.

    Reads the regulation source, builds its tree, builds each requested
    notice by document number, then writes the regulation and its layers.

    :param filename: path of the regulation source, read as UTF-8.
    :param title: CFR title, handed to ``Builder`` as ``cfr_title``.
    :param act_title: act title passed to layer generation.
    :param act_section: act section passed to layer generation.
    :param notice_doc_numbers: iterable of notice document numbers to build.
    :param doc_number: document number of the starting version; determined
        from the source via ``Builder.determine_doc_number`` when ``None``.
    :param checkpoint: argument for ``Checkpointer``; when falsy a
        ``NullCheckpointer`` is used instead.
    :raises ValueError: when ``doc_number`` was not given and could not be
        determined from the source.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        reg = f.read()
    # Checkpoints are keyed on the content of the input file
    file_digest = hashlib.sha256(reg.encode('utf-8')).hexdigest()

    if checkpoint:
        checkpointer = Checkpointer(checkpoint)
    else:
        checkpointer = NullCheckpointer()

    # build the initial tree
    reg_tree = checkpointer.checkpoint("init-tree-" + file_digest,
                                       lambda: Builder.reg_tree(reg))

    title_part = reg_tree.label_id()

    if doc_number is None:
        doc_number = Builder.determine_doc_number(reg, title, title_part)
        # Fail with a clear message here, not with a TypeError from the
        # join below.
        if not doc_number:
            raise ValueError("Could not determine document number")

    # Use the ``title`` parameter: this function has no ``args`` of its own,
    # so ``args.title`` only worked if a module-level ``args`` happened to
    # exist.
    checkpointer.suffix = ":".join(
        ["", title_part, str(title), doc_number])

    # create the builder
    builder = Builder(cfr_title=title,
                      cfr_part=title_part,
                      doc_number=doc_number,
                      checkpointer=checkpointer)

    builder.fetch_notices_json()

    for notice in notice_doc_numbers:
        builder.build_notice_from_doc_number(notice)

    builder.write_regulation(reg_tree)
    layer_cache = LayerCacheAggregator()

    act_title_and_section = [act_title, act_section]

    builder.gen_and_write_layers(reg_tree, act_title_and_section, layer_cache)
    layer_cache.replace_using(reg_tree)

    # NOTE(review): ``args`` is not a parameter of this function, so this
    # still depends on a module-level ``args`` (NameError otherwise). There
    # is no parameter carrying this flag; left unchanged to keep the
    # signature and current script behaviour intact.
    if args.generate_diffs:
        generate_diffs(reg_tree, act_title_and_section, builder, layer_cache)
if __name__ == "__main__":
    # Script entry point: build a regulation starting from the notice whose
    # document number is given as the third command-line argument.
    if len(sys.argv) < 6:
        print("Usage: python build_from.py regulation.xml title " +
              "notice_doc_# act_title act_section (Generate diffs? " +
              "True/False)")
        print(" e.g. python build_from.py rege.txt 12 2011-31725 15 1693 " +
              "False")
        # sys.exit (not the site-provided ``exit`` helper), with a non-zero
        # status so callers can tell the run did not happen.
        sys.exit(1)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as f:
        reg = f.read()

    doc_number = sys.argv[3]

    # First, the regulation tree
    reg_tree = Builder.reg_tree(reg)

    builder = Builder(cfr_title=int(sys.argv[2]),
                      cfr_part=reg_tree.label_id(),
                      doc_number=doc_number)

    # Didn't include the provided version
    if not any(n['document_number'] == doc_number for n in builder.notices):
        # Function-call form of print: valid on Python 2 and Python 3 alike
        # (the former print statement was a SyntaxError on Python 3).
        print("Could not find notice_doc_#, %s" % doc_number)
        sys.exit(1)

    builder.write_notices()

    # Always do at least the first reg
    logger.info("Version %s", doc_number)
    builder.write_regulation(reg_tree)
def test_layer_cache(self, init):
    """Integration test for layer caching

    Regenerates the layers of a small tree several times while changing
    node text and invalidating the cache, checking each time which node
    labels appear in one of the written layers."""
    init.return_value = None
    cache = LayerCacheAggregator()
    b = Builder()   # Don't need parameters as init's been mocked out
    b.cfr_title, b.cfr_part, b.doc_number = 15, '111', '111-222'
    b.writer = Mock()
    write = b.writer.layer.return_value.write
    tree = Node(label=["1234"], children=[
        Node(label=["1234", "1"], children=[
            Node("See paragraph (b)", label=["1234", "1", "a"]),
            Node("This is b", label=["1234", "1", "b"])])])

    def written_keys():
        """Regenerate all layers; return the sorted keys of the payload
        passed to the fourth ``write`` call, then reset the mock."""
        b.gen_and_write_layers(tree, [], cache, [])
        # NOTE(review): index 3 selects the fourth layer written --
        # presumably the one holding paragraph references; confirm.
        arg = write.call_args_list[3][0][0]
        # Materialise as a sorted list: on Python 3 ``dict.keys()`` is a
        # view and never compares equal to a list.
        keys = sorted(arg.keys())
        write.reset_mock()
        return keys

    # Initially only paragraph (a) has an entry
    self.assertEqual(['1234-1-a'], written_keys())

    cache.replace_using(tree)

    # Changing (b)'s text is not picked up while the cache is in place
    tree.children[0].children[1].text = "References paragraph (a)"
    self.assertEqual(['1234-1-a'], written_keys())

    # Likewise, (a) keeps its entry although its text changed
    tree.children[0].children[0].text = "Contains no references"
    self.assertEqual(['1234-1-a'], written_keys())

    # Invalidating with a notice that has no changes leaves the result as is
    notice = {'document_number': '111-222'}
    cache.invalidate_by_notice(notice)
    self.assertEqual(['1234-1-a'], written_keys())

    # A change to (b) makes (b)'s entry appear alongside (a)'s
    notice['changes'] = {'1234-1-b': 'some change'}
    cache.invalidate_by_notice(notice)
    self.assertEqual(['1234-1-a', '1234-1-b'], written_keys())

    # After a subpart-level change only (b) has an entry
    notice['changes'] = {'1234-Subpart-A': 'some change'}
    cache.invalidate_by_notice(notice)
    self.assertEqual(['1234-1-b'], written_keys())