コード例 #1
0
def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """ Write out each revision of the regulation, then a diff for every
        pair of versions. Lives in its own function so that profiling can
        attribute the time spent here separately from the rest of the
        parser. """
    initial_version = builder.doc_number
    checkpointer = builder.checkpointer
    # Frozen tree for each version, keyed by document number
    frozen_by_version = {initial_version: FrozenNode.from_node(reg_tree)}

    for notice, previous_tree, revised_tree, notice_batch in (
            builder.revision_generator(reg_tree)):
        doc_id = notice['document_number']
        logger.info("Version %s", doc_id)
        frozen_by_version[doc_id] = FrozenNode.from_node(revised_tree)
        builder.doc_number = doc_id
        builder.write_regulation(revised_tree)
        layer_cache.invalidate_by_notice(notice)
        builder.gen_and_write_layers(
            revised_tree, act_title_and_section, layer_cache, notice_batch)
        layer_cache.replace_using(revised_tree)
        # release references early to free some memory
        del notice, previous_tree, revised_tree, notice_batch

    # Grab what the diff phase needs before dropping the big objects
    reg_label = reg_tree.label_id()
    diff_writer = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # Diff every ordered pair of versions; a version paired with itself
    # is included on purpose and produces an "empty" diff.
    for left_id, left_tree in frozen_by_version.iteritems():
        for right_id, right_tree in frozen_by_version.iteritems():
            checkpoint_tag = "-".join(["diff", left_id, right_id])
            changes = checkpointer.checkpoint(
                checkpoint_tag,
                lambda: dict(changes_between(left_tree, right_tree)))
            diff_target = diff_writer.diff(reg_label, left_id, right_id)
            diff_target.write(changes)
コード例 #2
0
ファイル: builder.py プロジェクト: cfpb/regulations-parser
    def write_notice(self, doc_number, old_tree=None, reg_tree=None,
                     layers=None, last_version=''):
        """ Write out the notice whose document number is ``doc_number``.

            ``reg_tree`` is passed on to the writer (the XMLWriter needs
            the regulation tree along with the notice). If both
            ``old_tree`` and ``reg_tree`` are supplied, the changes
            between them are written with the notice; otherwise the
            change set is empty. ``last_version`` is forwarded as the
            writer's ``left_doc_number``. """
        # First notice with a matching document number; None if there
        # is no match.
        notice = None
        for candidate in self.notices:
            if candidate['document_number'] == doc_number:
                notice = candidate
                break

        # Diffs are optional: they need both the old and the new tree.
        if old_tree is None or reg_tree is None:
            changes = {}
        else:
            # FrozenNode and Node are not API-compatible (troublesome),
            # so both trees are converted before diffing.
            frozen_old = FrozenNode.from_node(old_tree)
            frozen_new = FrozenNode.from_node(reg_tree)
            changes = dict(changes_between(frozen_old, frozen_new))

        notice_writer = self.writer.notice(
            self.cfr_part, self.doc_number,
            notices=self.notices, layers=layers)
        notice_writer.write(
            notice, changes=changes, reg_tree=reg_tree,
            left_doc_number=last_version)
コード例 #3
0
    def write_notice(self,
                     doc_number,
                     old_tree=None,
                     reg_tree=None,
                     layers=None,
                     last_version=''):
        """ Write one notice, looked up by its document number.

            The XMLWriter needs the regulation tree for the notice, so
            ``reg_tree`` is handed to the writer as well. Changes are
            only computed when both ``old_tree`` and ``reg_tree`` are
            given; ``last_version`` becomes ``left_doc_number``. """
        # Locate the notice; stays None when no document number matches
        notice = None
        for entry in self.notices:
            if entry['document_number'] == doc_number:
                notice = entry
                break

        # Optionally write the diffs out with the notice
        have_both_trees = old_tree is not None and reg_tree is not None
        if have_both_trees:
            # FrozenNode and Node are not API-compatible, which is
            # troublesome: freeze both sides before comparing.
            left = FrozenNode.from_node(old_tree)
            right = FrozenNode.from_node(reg_tree)
            changes = dict(changes_between(left, right))
        else:
            changes = {}

        # Write the notice
        target = self.writer.notice(self.cfr_part, self.doc_number,
                                    notices=self.notices, layers=layers)
        target.write(notice, changes=changes, reg_tree=reg_tree,
                     left_doc_number=last_version)
コード例 #4
0
def generate_diffs(reg_tree, act_title_and_section, builder, layer_cache):
    """ Produce all diffs for the given regulation.

        Kept separate from its caller to help profiling: it makes it
        easier to see which parts of the parser take the most time. """
    first_version = builder.doc_number
    checkpointer = builder.checkpointer
    # document number -> frozen regulation tree
    version_trees = {first_version: FrozenNode.from_node(reg_tree)}

    for latest_notice, prior_tree, updated_tree, applied in (
            builder.revision_generator(reg_tree)):
        version_id = latest_notice['document_number']
        logger.info("Version %s", version_id)
        version_trees[version_id] = FrozenNode.from_node(updated_tree)
        builder.doc_number = version_id
        builder.write_regulation(updated_tree)
        layer_cache.invalidate_by_notice(latest_notice)
        builder.gen_and_write_layers(
            updated_tree, act_title_and_section, layer_cache, applied)
        layer_cache.replace_using(updated_tree)
        # free some memory
        del latest_notice, prior_tree, updated_tree, applied

    # Keep only what the diff loop needs, then drop the rest
    regulation_label = reg_tree.label_id()
    out = builder.writer
    del reg_tree, layer_cache, builder  # free some memory

    # Every version is compared with every version, itself included, so
    # "empty" self-diffs are written too.
    for lhs_id, lhs_frozen in version_trees.iteritems():
        for rhs_id, rhs_frozen in version_trees.iteritems():
            tag = "-".join(["diff", lhs_id, rhs_id])
            changes = checkpointer.checkpoint(
                tag, lambda: dict(changes_between(lhs_frozen, rhs_frozen)))
            diff_out = out.diff(regulation_label, lhs_id, rhs_id)
            diff_out.write(changes)
コード例 #5
0
def generate_diff(left_xml, right_xml):
    """ Return a dictionary of the changes between two full RegML trees,
        in the style of regulations-parser.
        Thin wrapper around regulations-parser's changes_between(). """
    parsed_left = build_reg_tree(left_xml)
    parsed_right = build_reg_tree(right_xml)
    # changes_between is given frozen copies of both trees
    frozen_pair = (FrozenNode.from_node(parsed_left),
                   FrozenNode.from_node(parsed_right))
    return dict(changes_between(*frozen_pair))
コード例 #6
0
    def test_title_disappears(self):
        """Removing a title is reported as a delete over the old title"""
        before = FrozenNode("Text", title="Some Title", label=['1111'])
        after = FrozenNode("Text", title=None, label=['1111'])

        changes = dict(difftree.changes_between(before, after))

        # "Some Title" is 10 characters long, hence the (0, 10) span
        expected = {'op': 'modified', 'title': [('delete', 0, 10)]}
        self.assertEqual(changes['1111'], expected)
コード例 #7
0
def generate_diff(left_xml, right_xml):
    """ Diff two full RegML trees and return the changes as a dictionary
        in the style of regulations-parser.
        This is a wrapper over regulations-parser's changes_between(). """
    # Build both regulation trees first, then freeze them in the same
    # left-to-right order.
    reg_trees = [build_reg_tree(xml) for xml in (left_xml, right_xml)]
    frozen_left, frozen_right = [
        FrozenNode.from_node(tree) for tree in reg_trees]
    changes = changes_between(frozen_left, frozen_right)
    return dict(changes)
コード例 #8
0
 def test_child_added(self):
     """Adding a child should show up as child_ops on the parent"""
     existing = FrozenNode("Child1", label=['1111', 'a'])
     before = FrozenNode("Root", label=['1111'], children=[existing])
     added = FrozenNode("Child2", label=['1111', 'b'])
     after = before.clone(children=before.children + (added,))
     changes = dict(difftree.changes_between(before, after))
     expected_ops = [
         ('equal', 0, 1),              # 1111-a
         ('insert', 1, ('1111-b',)),
     ]
     self.assertEqual(changes['1111'],
                      {'op': 'modified', 'child_ops': expected_ops})
コード例 #9
0
 def test_child_order(self):
     """Reordering a node's children should be reported via child_ops"""
     child_a = FrozenNode("Child1", label=['1111', 'a'])
     child_b = FrozenNode("Child2", label=['1111', 'b'])
     before = FrozenNode("Root", label=['1111'],
                         children=[child_a, child_b])
     swapped = list(reversed(before.children))
     after = before.clone(children=swapped)
     changes = dict(difftree.changes_between(before, after))
     # Note that these ops could change in other versions of difflib.
     expected_ops = [
         ('insert', 0, ('1111-b',)),
         ('equal', 0, 1),              # 1111-a
         ('delete', 1, 2),
     ]
     self.assertEqual(changes['1111'],
                      {'op': 'modified', 'child_ops': expected_ops})
コード例 #10
0
 def test_child_removed_with_edit(self):
     """When a child is removed and the parent's text is edited in the
     same change, both the text ops and the child_ops are reported"""
     child_a = FrozenNode("Child1", label=['1111', 'a'])
     child_b = FrozenNode("Child2", label=['1111', 'b'])
     before = FrozenNode("Root", label=['1111'],
                         children=[child_a, child_b])
     # Keep only the first child and append to the parent's text
     after = before.clone(children=before.children[:1],
                          text="Root modified")
     changes = dict(difftree.changes_between(before, after))
     expected = {
         'op': 'modified',
         'text': [('insert', len("Root"), " modified")],
         'child_ops': [
             ('equal', 0, 1),          # 1111-a
             ('delete', 1, 2),
         ],
     }
     self.assertEqual(changes['1111'], expected)
コード例 #11
0
    def test_subparts(self):
        """ Create a tree with no subparts, then add subparts.

            The section text is the same in both versions; only the
            subpart headings are new. The diff is expected to report the
            new subparts as added and '204-Subpart' as deleted, while
            leaving the sections themselves out of the result. """
        title = u"Regulation Title"
        sect1_title = u"§ 204.1 First Section"
        sect1 = u"(a) I believe this is (b) the best section "
        sect2_title = u"§ 204.2 Second Section"
        sect2 = u"Some sections \ndon't have \ndepth at all."

        old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
        older = reg_text.build_reg_text_tree(old_text, 204)

        # Same title and sections, with a subpart heading before each one
        nsubpart_a = u"Subpart A—First subpart"
        nsubpart_b = u"Subpart B—Second subpart"

        new_text = "\n".join([
            title, nsubpart_a, sect1_title, sect1, nsubpart_b, sect2_title,
            sect2
        ])
        newer = reg_text.build_reg_text_tree(new_text, 204)

        result = dict(
            difftree.changes_between(FrozenNode.from_node(older),
                                     FrozenNode.from_node(newer)))

        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(
            result['204-Subpart-A'], {
                "node": {
                    "text": u"",
                    "node_type": u"subpart",
                    "tagged_text": None,
                    "label": ("204", "Subpart", "A"),
                    "child_labels": ("204-1", ),
                    "title": u"First subpart"
                },
                "op": "added"
            })
        # assertIn / assertNotIn name the missing or unexpected key when
        # they fail
        self.assertIn('204-Subpart-B', result)
        self.assertEqual(result['204-Subpart'], {"op": "deleted"})
        # Sections shouldn't have changed, though
        self.assertNotIn('204-1', result)
        self.assertNotIn('204-2', result)
コード例 #12
0
 def test_child_added(self):
     """A newly added child should produce child_ops on its parent"""
     only_child = FrozenNode("Child1", label=['1111', 'a'])
     original = FrozenNode("Root", label=['1111'], children=[only_child])
     extra_child = FrozenNode("Child2", label=['1111', 'b'])
     grown_children = original.children + (extra_child,)
     updated = original.clone(children=grown_children)
     diff = dict(difftree.changes_between(original, updated))
     expected = {
         'op': 'modified',
         'child_ops': [
             ('equal', 0, 1),              # 1111-a
             ('insert', 1, ('1111-b',)),
         ],
     }
     self.assertEqual(diff['1111'], expected)
コード例 #13
0
    def test_subparts(self):
        """ Create a tree with no subparts, then add subparts.

            Both versions carry the same section text; the newer one adds
            a subpart heading in front of each section. The new subparts
            should be reported as added, '204-Subpart' as deleted, and
            the sections should not appear in the result at all. """
        title = u"Regulation Title"
        sect1_title = u"§ 204.1 First Section"
        sect1 = u"(a) I believe this is (b) the best section "
        sect2_title = u"§ 204.2 Second Section"
        sect2 = u"Some sections \ndon't have \ndepth at all."

        old_text = "\n".join([title, sect1_title, sect1, sect2_title, sect2])
        older = reg_text.build_reg_text_tree(old_text, 204)

        ntitle = u"Regulation Title"
        nsubpart_a = u"Subpart A—First subpart"
        nsect1_title = u"§ 204.1 First Section"
        nsect1 = u"(a) I believe this is (b) the best section "
        nsubpart_b = u"Subpart B—Second subpart"
        nsect2_title = u"§ 204.2 Second Section"
        nsect2 = u"Some sections \ndon't have \ndepth at all."

        new_lines = [
            ntitle, nsubpart_a, nsect1_title, nsect1,
            nsubpart_b, nsect2_title, nsect2,
        ]
        new_text = "\n".join(new_lines)
        newer = reg_text.build_reg_text_tree(new_text, 204)

        result = dict(difftree.changes_between(
            FrozenNode.from_node(older), FrozenNode.from_node(newer)))

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(
            result["204-Subpart-A"],
            {
                "node": {
                    "text": u"",
                    "node_type": u"subpart",
                    "tagged_text": None,
                    "label": ("204", "Subpart", "A"),
                    "child_labels": ("204-1",),
                    "title": u"First subpart",
                },
                "op": "added",
            },
        )
        # Membership assertions report the offending key on failure
        self.assertIn("204-Subpart-B", result)
        self.assertEqual(result["204-Subpart"], {"op": "deleted"})
        # Sections shouldn't have changed, though
        self.assertNotIn("204-1", result)
        self.assertNotIn("204-2", result)
コード例 #14
0
 def test_child_removed_with_edit(self):
     """Dropping a child while also editing the parent's text should
     yield both text ops and child_ops for the parent"""
     kept = FrozenNode("Child1", label=['1111', 'a'])
     dropped = FrozenNode("Child2", label=['1111', 'b'])
     original = FrozenNode("Root", label=['1111'], children=[kept, dropped])
     remaining = original.children[:1]
     edited = original.clone(children=remaining, text="Root modified")
     diff = dict(difftree.changes_between(original, edited))
     text_ops = [('insert', len("Root"), " modified")]
     child_ops = [
         ('equal', 0, 1),   # 1111-a
         ('delete', 1, 2),
     ]
     self.assertEqual(
         diff['1111'],
         {'op': 'modified', 'text': text_ops, 'child_ops': child_ops})
コード例 #15
0
def generate_xml(filename,
                 title,
                 act_title,
                 act_section,
                 notice_doc_numbers,
                 doc_number=None,
                 checkpoint=None):
    """ Write the regulation, and then each later revision together with
        its notice, through a builder created with ``writer_type='XML'``.

        ``filename``, ``title`` and ``checkpoint`` are handed to
        ``tree_and_builder``. ``act_title`` and ``act_section`` are paired
        up and passed to layer generation. ``doc_number`` is only used as
        the key under which the initial tree is stored in
        ``all_versions``. ``notice_doc_numbers`` is not referenced in this
        body. """
    act_title_and_section = [act_title, act_section]

    # First, the regulation tree
    reg_tree, builder = tree_and_builder(filename,
                                         title,
                                         checkpoint,
                                         writer_type='XML')
    layer_cache = LayerCacheAggregator()
    layers = builder.generate_layers(reg_tree, act_title_and_section,
                                     layer_cache)

    # Always do at least the first reg.
    # The message needs a %s placeholder: logging applies msg % args
    # lazily, so an argument without a placeholder is a formatting error
    # and the version would never be logged.
    logger.info("Version %s", builder.doc_number)
    builder.write_regulation(reg_tree, layers=layers)
    # NOTE(review): all_versions is filled in but never read in this
    # function, and the initial key is the doc_number argument (None by
    # default) rather than builder.doc_number - confirm this is intended.
    all_versions = {doc_number: FrozenNode.from_node(reg_tree)}

    for last_notice, old, new_tree, notices in builder.revision_generator(
            reg_tree):
        version = last_notice['document_number']
        logger.info("Version %s", version)
        all_versions[version] = FrozenNode.from_node(new_tree)
        builder.doc_number = version
        layers = builder.generate_layers(new_tree, act_title_and_section,
                                         layer_cache, notices)
        builder.write_regulation(new_tree, layers=layers)
        # The notice is written with both the previous and the new tree,
        # so write_notice can include the diff between them
        builder.write_notice(version,
                             old_tree=old,
                             reg_tree=new_tree,
                             layers=layers)
        layer_cache.invalidate_by_notice(last_notice)
        layer_cache.replace_using(new_tree)
        del last_notice, old, new_tree, notices  # free some memory
コード例 #16
0
 def test_child_order(self):
     """A change in the order of children should produce child_ops"""
     first = FrozenNode("Child1", label=['1111', 'a'])
     second = FrozenNode("Child2", label=['1111', 'b'])
     original = FrozenNode("Root", label=['1111'], children=[first, second])
     reordered = original.clone(children=list(reversed(original.children)))
     diff = dict(difftree.changes_between(original, reordered))
     # Note that these ops could change in other versions of difflib.
     expected = {
         'op': 'modified',
         'child_ops': [
             ('insert', 0, ('1111-b',)),
             ('equal', 0, 1),              # 1111-a
             ('delete', 1, 2),
         ],
     }
     self.assertEqual(diff['1111'], expected)
コード例 #17
0
    def test_subparts(self):
        """ Create a tree with no subparts, then add subparts.

            Both trees are built directly from FrozenNodes. The old tree
            holds both sections under a single 'emptypart' node; the new
            tree places each section under its own 'subpart' node. The
            new subparts should be reported as added, '204-Subpart' as
            deleted, and the sections should not appear in the result. """
        old_tree = FrozenNode(
            title="Regulation Title",
            label=['204'],
            children=[
                FrozenNode(
                    node_type='emptypart',
                    label=['204', 'Subpart'],
                    children=[
                        FrozenNode(
                            title=u"§ 204.1 First Section",
                            label=['204', '1'],
                            children=[
                                FrozenNode(
                                    text=
                                    "(a) I believe this is the best section",
                                    label=['204', '1', 'a'])
                            ]),
                        FrozenNode(
                            title=u"§ 204.2 Second Section",
                            label=['204', '2'],
                            text=u"Some sections \ndon't have \ndepth at all.")
                    ])
            ])
        new_tree = FrozenNode(
            title="Regulation Title",
            label=['204'],
            children=[
                FrozenNode(
                    node_type='subpart',
                    label=['204', 'Subpart', 'A'],
                    title=u"Subpart A—First subpart",
                    children=[
                        FrozenNode(
                            title=u"§ 204.1 First Section",
                            label=['204', '1'],
                            children=[
                                FrozenNode(
                                    text=
                                    "(a) I believe this is the best section",
                                    label=['204', '1', 'a'])
                            ])
                    ]),
                FrozenNode(
                    node_type='subpart',
                    label=['204', 'Subpart', 'B'],
                    title=u"Subpart B—Second subpart",
                    children=[
                        FrozenNode(
                            title=u"§ 204.2 Second Section",
                            label=['204', '2'],
                            text=u"Some sections \ndon't have \ndepth at all.")
                    ])
            ])

        result = dict(difftree.changes_between(old_tree, new_tree))

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(
            result['204-Subpart-A'], {
                "node": {
                    "text": u"",
                    "node_type": u"subpart",
                    "tagged_text": None,
                    "label": ("204", "Subpart", "A"),
                    "child_labels": ("204-1", ),
                    "title": u"Subpart A—First subpart"
                },
                "op": "added"
            })
        # Membership assertions report the offending key on failure
        self.assertIn('204-Subpart-B', result)
        self.assertEqual(result['204-Subpart'], {"op": "deleted"})
        # Sections shouldn't have changed, though
        self.assertNotIn('204-1', result)
        self.assertNotIn('204-2', result)
コード例 #18
0
 def test_whitespace_comparison(self):
     """Text that differs only in whitespace should yield no changes"""
     tab_newline = FrozenNode(u"Some\t\nthing", label=['123'])
     # Same words, separated by U+2009 (thin space) plus a plain space
     thin_space = tab_newline.clone(text=u"Some\u2009 thing")
     changes = difftree.changes_between(tab_newline, thin_space)
     self.assertEqual(changes, [])
コード例 #19
0
    # Diff generation used to be implied whenever gen-diffs was not
    # specified; it is now gated on an explicit check of the flag.
    if args.generate_diffs:
        # Regulation tree for each version, keyed by document number
        all_versions = {doc_number: reg_tree}

        # Write each revision and its layers, remembering the tree
        for last_notice, old, new_tree, notices in builder.revision_generator(
                reg_tree):
            version = last_notice['document_number']
            logger.info("Version %s", version)
            all_versions[version] = new_tree
            builder.doc_number = version
            builder.write_regulation(new_tree)
            layer_cache.invalidate_by_notice(last_notice)
            builder.gen_and_write_layers(new_tree, act_title_and_section,
                                         layer_cache, notices)
            layer_cache.replace_using(new_tree)

        # convert to frozen trees (values are replaced in place; the set
        # of keys does not change while iterating)
        for doc in all_versions:
            all_versions[doc] = FrozenNode.from_node(all_versions[doc])

        # now build diffs - include "empty" diffs comparing a version to itself
        for lhs_version, lhs_tree in all_versions.iteritems():
            for rhs_version, rhs_tree in all_versions.iteritems():
                # The diff computation is passed as a callable under a
                # "diff-<lhs>-<rhs>" tag
                changes = checkpointer.checkpoint(
                    "-".join(["diff", lhs_version, rhs_version]),
                    lambda: dict(changes_between(lhs_tree, rhs_tree)))
                builder.writer.diff(
                    reg_tree.label_id(), lhs_version, rhs_version
                ).write(changes)
コード例 #20
0
 def test_whitespace_comparison(self):
     """Whitespace-only differences shouldn't register as a diff"""
     original = FrozenNode(u"Some\t\nthing", label=['123'])
     respaced_text = u"Some\u2009 thing"   # \u2009 is a thin-space
     respaced = original.clone(text=respaced_text)
     self.assertEqual(difftree.changes_between(original, respaced), [])