def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """ Write out every version of the regulation and then every pairwise
    diff between those versions.

    Broken out into a separate function to assist with profiling, so it is
    easier to determine which parts of the parser take the most time.

    :param reg_tree: the initial regulation tree; a frozen copy is stored
        under ``builder.doc_number``.
    :param act_title_and_section: forwarded unchanged to
        ``builder.gen_and_write_layers``.
    :param builder: supplies ``doc_number``, ``checkpointer``, ``writer``,
        ``revision_generator``, ``write_regulation`` and
        ``gen_and_write_layers``. Its ``doc_number`` attribute is reassigned
        to each version as that version is processed.
    :param layer_cache: invalidated by each notice and refreshed with each
        new tree.
    """
    doc_number, checkpointer = builder.doc_number, builder.checkpointer
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        builder.write_regulation(new_tree)
        layer_cache.invalidate_by_notice(last_notice)
        builder.gen_and_write_layers(new_tree, act_title_and_section,
                                     layer_cache, notices)
        layer_cache.replace_using(new_tree)
        del last_notice, old, new_tree, notices     # free some memory

    # Grab the only two things still needed before dropping the big objects
    label_id = reg_tree.label_id()
    writer = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # now build diffs - include "empty" diffs comparing a version to itself.
    # items() is used because iteritems() does not exist on Python 3 dicts.
    for lhs_version, lhs_tree in all_versions.items():
        for rhs_version, rhs_tree in all_versions.items():
            # NOTE(review): the lambda closes over the loop variables, so it
            # must be invoked by checkpoint() before the next iteration --
            # presumably it is, since the result is used right away; confirm.
            changes = checkpointer.checkpoint(
                "-".join(["diff", lhs_version, rhs_version]),
                lambda: dict(changes_between(lhs_tree, rhs_tree)))
            writer.diff(
                label_id, lhs_version, rhs_version
            ).write(changes)
def write_notice(self, doc_number, old_tree=None, reg_tree=None,
                 layers=None, last_version=''):
    """ Write out the single notice identified by ``doc_number``.

    The regulation tree is forwarded to the notice writer because the
    XMLWriter needs it. A diff is attached only when both ``old_tree`` and
    ``reg_tree`` are given; otherwise an empty dict of changes is written.
    """
    # First notice with a matching document number, or None if there is none
    matching = (candidate for candidate in self.notices
                if candidate['document_number'] == doc_number)
    notice = next(matching, None)

    have_both_trees = old_tree is not None and reg_tree is not None
    if have_both_trees:
        # FrozenNode and Node are not API-compatible, so both trees are
        # frozen before being compared.
        frozen_old = FrozenNode.from_node(old_tree)
        frozen_new = FrozenNode.from_node(reg_tree)
        changes = dict(changes_between(frozen_old, frozen_new))
    else:
        changes = {}

    # NOTE(review): the writer is keyed on self.doc_number, not on the
    # doc_number argument -- confirm that this is intended.
    notice_writer = self.writer.notice(self.cfr_part, self.doc_number,
                                       notices=self.notices, layers=layers)
    notice_writer.write(notice, changes=changes, reg_tree=reg_tree,
                        left_doc_number=last_version)
def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """ Generate all the diffs for the given regulation.

    Each revision produced by ``builder.revision_generator`` is written out
    (regulation and layers) and a frozen copy of its tree is kept; then a
    diff is written for every ordered pair of versions. Broken out into a
    separate function to assist with profiling.

    :param reg_tree: starting regulation tree, recorded under the builder's
        current ``doc_number``.
    :param act_title_and_section: handed to ``builder.gen_and_write_layers``.
    :param builder: object providing the checkpointer, the writer and the
        revision/regulation/layer writing methods; its ``doc_number`` is
        updated to each version in turn.
    :param layer_cache: cache that is invalidated per notice and then
        replaced using the new tree.
    """
    checkpointer = builder.checkpointer
    all_versions = {builder.doc_number: FrozenNode.from_node(reg_tree)}

    revisions = builder.revision_generator(reg_tree)
    for last_notice, old, new_tree, notices in revisions:
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        builder.write_regulation(new_tree)
        layer_cache.invalidate_by_notice(last_notice)
        builder.gen_and_write_layers(new_tree, act_title_and_section,
                                     layer_cache, notices)
        layer_cache.replace_using(new_tree)
        # free some memory before the generator builds the next revision
        del last_notice, old, new_tree, notices
    del revisions

    label_id = reg_tree.label_id()
    writer = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # Build the diffs, including the "empty" diff of a version with itself.
    # dict.items() is available on Python 2 and 3 alike, whereas iteritems()
    # raises AttributeError on Python 3.
    for lhs_version, lhs_tree in all_versions.items():
        for rhs_version, rhs_tree in all_versions.items():
            checkpoint_key = "-".join(["diff", lhs_version, rhs_version])
            changes = checkpointer.checkpoint(
                checkpoint_key,
                lambda: dict(changes_between(lhs_tree, rhs_tree)))
            writer.diff(label_id, lhs_version, rhs_version).write(changes)
def write_notice(self, doc_number, old_tree=None, reg_tree=None,
                 layers=None, last_version=''):
    """ Write a single notice out.

    For the XMLWriter the regulation tree has to accompany the notice, so
    ``reg_tree`` is passed through to the writer. When both ``old_tree`` and
    ``reg_tree`` are provided the changes between them are written too;
    otherwise the changes are an empty dict.
    """
    # Look the notice up by document number; stays None when nothing matches
    notice = None
    for candidate in self.notices:
        if candidate['document_number'] == doc_number:
            notice = candidate
            break

    # The diff is optional: it needs both the old and the new tree
    changes = {}
    if not (old_tree is None or reg_tree is None):
        # FrozenNode and Node are not API-compatible, hence the conversion
        changes = dict(changes_between(FrozenNode.from_node(old_tree),
                                       FrozenNode.from_node(reg_tree)))

    # Write the notice
    self.writer.notice(
        self.cfr_part, self.doc_number, notices=self.notices, layers=layers
    ).write(notice, changes=changes, reg_tree=reg_tree,
            left_doc_number=last_version)
def generate_diff(left_xml, right_xml):
    """ Return the dictionary of changes between two full RegML trees, in
    the style of regulations-parser.

    Both inputs are turned into regulation trees with build_reg_tree,
    frozen, and compared with regulations-parser's changes_between(). """
    left_tree = build_reg_tree(left_xml)
    right_tree = build_reg_tree(right_xml)

    frozen_left = FrozenNode.from_node(left_tree)
    frozen_right = FrozenNode.from_node(right_tree)

    return dict(changes_between(frozen_left, frozen_right))
def generate_diff(left_xml, right_xml):
    """ Wrap regulations-parser's changes_between() for two full RegML
    trees.

    Returns a dict of the changes between the left and the right document,
    in the style of regulations-parser. """
    # Build both regulation trees first, then freeze them, left before right
    trees = [build_reg_tree(xml) for xml in (left_xml, right_xml)]
    frozen_left, frozen_right = [FrozenNode.from_node(tree) for tree in trees]

    changes = changes_between(frozen_left, frozen_right)
    return dict(changes)
def test_subparts(self):
    """ Create a tree with no subparts, then add subparts.

    Expects Subpart A and Subpart B to be reported as added, the
    '204-Subpart' node as deleted, and no entry for either section. """
    title = u"Regulation Title"
    sect1_title = u"§ 204.1 First Section"
    sect1 = u"(a) I believe this is (b) the best section "
    sect2_title = u"§ 204.2 Second Section"
    sect2 = u"Some sections \ndon't have \ndepth at all."

    old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
    older = reg_text.build_reg_text_tree(old_text, 204)

    ntitle = u"Regulation Title"
    nsubpart_a = u"Subpart A—First subpart"
    nsect1_title = u"§ 204.1 First Section"
    nsect1 = u"(a) I believe this is (b) the best section "
    nsubpart_b = u"Subpart B—Second subpart"
    nsect2_title = u"§ 204.2 Second Section"
    nsect2 = u"Some sections \ndon't have \ndepth at all."

    new_text = "\n".join([
        ntitle, nsubpart_a, nsect1_title, nsect1, nsubpart_b, nsect2_title,
        nsect2
    ])
    newer = reg_text.build_reg_text_tree(new_text, 204)

    result = dict(
        difftree.changes_between(FrozenNode.from_node(older),
                                 FrozenNode.from_node(newer)))

    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in Python 3.12's unittest.
    self.assertEqual(
        result['204-Subpart-A'], {
            "node": {
                "text": u"",
                "node_type": u"subpart",
                "tagged_text": None,
                "label": ("204", "Subpart", "A"),
                "child_labels": ("204-1", ),
                "title": u"First subpart"
            },
            "op": "added"
        })
    self.assertIn('204-Subpart-B', result)
    self.assertEqual(result['204-Subpart'], {"op": "deleted"})

    # Sections shouldn't have changed, though
    self.assertNotIn('204-1', result)
    self.assertNotIn('204-2', result)
def test_subparts(self):
    """ Create a tree with no subparts, then add subparts.

    The diff must mark both new subparts as added and '204-Subpart' as
    deleted, while leaving the two sections out of the result. """
    # Old version: two sections, no subpart headings
    title = u"Regulation Title"
    sect1_title = u"§ 204.1 First Section"
    sect1 = u"(a) I believe this is (b) the best section "
    sect2_title = u"§ 204.2 Second Section"
    sect2 = u"Some sections \ndon't have \ndepth at all."

    old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
    older = reg_text.build_reg_text_tree(old_text, 204)

    # New version: same sections, each now under its own subpart heading
    ntitle = u"Regulation Title"
    nsubpart_a = u"Subpart A—First subpart"
    nsect1_title = u"§ 204.1 First Section"
    nsect1 = u"(a) I believe this is (b) the best section "
    nsubpart_b = u"Subpart B—Second subpart"
    nsect2_title = u"§ 204.2 Second Section"
    nsect2 = u"Some sections \ndon't have \ndepth at all."

    new_text = "\n".join([ntitle, nsubpart_a, nsect1_title, nsect1,
                          nsubpart_b, nsect2_title, nsect2])
    newer = reg_text.build_reg_text_tree(new_text, 204)

    result = dict(difftree.changes_between(FrozenNode.from_node(older),
                                           FrozenNode.from_node(newer)))

    # assertEquals is a deprecated alias that was removed in Python 3.12;
    # assertEqual is the supported spelling on Python 2.7 and 3 alike.
    self.assertEqual(
        result["204-Subpart-A"],
        {
            "node": {
                "text": u"",
                "node_type": u"subpart",
                "tagged_text": None,
                "label": ("204", "Subpart", "A"),
                "child_labels": ("204-1",),
                "title": u"First subpart",
            },
            "op": "added",
        },
    )
    self.assertIn("204-Subpart-B", result)
    self.assertEqual(result["204-Subpart"], {"op": "deleted"})

    # Sections shouldn't have changed, though
    self.assertNotIn("204-1", result)
    self.assertNotIn("204-2", result)
def generate_xml(filename, title, act_title, act_section, notice_doc_numbers,
                 doc_number=None, checkpoint=None):
    """ Write the regulation, and each later version together with its
    notice, using the XML writer.

    :param filename: passed to ``tree_and_builder``.
    :param title: passed to ``tree_and_builder``.
    :param act_title: first element of the act title/section pair handed to
        layer generation.
    :param act_section: second element of that pair.
    :param notice_doc_numbers: not used by this function.
    :param doc_number: key under which the frozen initial tree is recorded.
    :param checkpoint: passed to ``tree_and_builder``.
    """
    act_title_and_section = [act_title, act_section]

    # First, the regulation tree
    reg_tree, builder = tree_and_builder(filename, title, checkpoint,
                                         writer_type='XML')
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree, act_title_and_section,
                                     layer_cache)

    # Always do at least the first reg.
    # The message needs a %s placeholder: logging applies ``msg % args``, and
    # an argument without a placeholder makes the handler report a
    # formatting error instead of the message.
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(reg_tree, layers=layers)

    # NOTE(review): all_versions is filled in but never read in this
    # function -- confirm whether it can be dropped.
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        layers = builder.generate_layers(new_tree, act_title_and_section,
                                         layer_cache, notices)
        builder.write_regulation(new_tree, layers=layers)
        builder.write_notice(version, old_tree=old, reg_tree=new_tree,
                             layers=layers)
        layer_cache.invalidate_by_notice(last_notice)
        layer_cache.replace_using(new_tree)
        del last_notice, old, new_tree, notices     # free some memory
# this used to assume implicitly that if gen-diffs was not specified it was
# True; changed it to explicit check
if args.generate_diffs:
    # Collect the plain trees first; they are frozen in one pass below
    all_versions = {doc_number: reg_tree}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = new_tree
        builder.doc_number = version
        builder.write_regulation(new_tree)
        layer_cache.invalidate_by_notice(last_notice)
        builder.gen_and_write_layers(new_tree, act_title_and_section,
                                     layer_cache, notices)
        layer_cache.replace_using(new_tree)

    # convert to frozen trees (values only; the set of keys is unchanged)
    for doc in all_versions:
        all_versions[doc] = FrozenNode.from_node(all_versions[doc])

    # The label id does not vary per pair, so look it up once rather than
    # once for every pair of versions.
    label_id = reg_tree.label_id()

    # now build diffs - include "empty" diffs comparing a version to itself.
    # items() works on Python 2 and 3; iteritems() is Python-2-only.
    for lhs_version, lhs_tree in all_versions.items():
        for rhs_version, rhs_tree in all_versions.items():
            changes = checkpointer.checkpoint(
                "-".join(["diff", lhs_version, rhs_version]),
                lambda: dict(changes_between(lhs_tree, rhs_tree)))
            builder.writer.diff(
                label_id, lhs_version, rhs_version
            ).write(changes)