def setUp(self):
     """Use a parser like that used for reg text."""
     self.regParser = ParagraphParser(r"\(%s\)", Node.REGTEXT)
Esempio n. 2
0
class DepthParagraphTest(TestCase):
    def setUp(self):
        """Use a parser like that used for reg text."""
        self.regParser = ParagraphParser(r"\({0}\)", Node.REGTEXT)

    def test_matching_subparagraph_ids(self):
        matches = self.regParser.matching_subparagraph_ids(0, 8)    # 'i'
        self.assertEqual(1, len(matches))
        self.assertEqual(2, matches[0][0])
        self.assertEqual(0, matches[0][1])
        matches = self.regParser.matching_subparagraph_ids(1, 3)    # '4'
        self.assertEqual(0, len(matches))

    def test_best_start(self):
        text = "This is my (ii) awesome text with a subparagraph in it."
        begin_match = (0, 0)
        end_match = (len(text), len(text))
        starts = [begin_match, end_match]
        self.assertEqual(end_match,
                         self.regParser.best_start(text, 0, 8, starts))
        self.assertEqual(begin_match,
                         self.regParser.best_start(text, 0, 9, starts))

    def test_find_paragraph_start_match_success(self):
        """Simple label checks."""
        text = "This (a) is (Z) the first (1) section for (2) something\n"
        text += "and then (iii) another thing goes here."
        self.assertEqual((5, 8),
                         self.regParser.find_paragraph_start_match(text, 0, 0))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 0, 1))
        self.assertEqual((26, 29),
                         self.regParser.find_paragraph_start_match(text, 1, 0))
        self.assertEqual((42, 45),
                         self.regParser.find_paragraph_start_match(text, 1, 1))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 1, 2))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 0))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 1))
        self.assertEqual((65, 70),
                         self.regParser.find_paragraph_start_match(text, 2, 2))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 3))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 3, 0))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(text, 3, 25))
        self.assertEqual(
            None, self.regParser.find_paragraph_start_match(text, 3, 26))

    def test_find_paragraph_start_match_excludes(self):
        """Excluded ranges should not be included in results."""
        text = "This (a) is (a) a test (a) section for (a) testing."
        self.assertEqual((5, 8),
                         self.regParser.find_paragraph_start_match(text, 0, 0))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(text, 0, 0, []))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(10, len(text))]))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 1)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 10)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 1), (4, 9)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(5, 5)]))
        self.assertEqual(
            (39, 42), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(5, 7), (10, 25)]))
        self.assertEqual(
            None, self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, len(text))]))

    def test_find_paragraph_start_match_is(self):
        """Test the case where we are looking for paragraph (i) (the
        letter,) but we run into (i) (the roman numeral.)"""
        text1 = "(h) Paragraph (1) H has (i) some (ii) sub (iii) sections but "
        text2 = "(i) this paragraph does not."
        self.assertEqual(
            (len(text1), len(text1)+3),
            self.regParser.find_paragraph_start_match(text1+text2, 0, 8))

    def test_paragraph_offsets_present(self):
        """Test that section_offsets works as expected for good input."""
        text = "This (a) is a good (b) test for (c) something like this."""
        self.assertEqual((5, 19), self.regParser.paragraph_offsets(text, 0, 0))
        self.assertEqual((19, 32),
                         self.regParser.paragraph_offsets(text, 0, 1))
        self.assertEqual((32, len(text)),
                         self.regParser.paragraph_offsets(text, 0, 2))

    def test_paragraph_offsets_not_present(self):
        """Verify we get None when the searched for text isn't there."""
        text = "This (a) is a good (b) test for (c) something like this."""
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 0, 3))
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 1, 0))
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 2, 0))

    def test_paragraphs(self):
        """This method should pull out the relevant paragraphs, as a list"""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        ps = self.regParser.paragraphs(text, 0)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings,
                         ["(a) is a good (1) test (2) of (3) some ",
                          "(b) body."])

        text = "(a) is a good (1) test (2) of (3) some "
        ps = self.regParser.paragraphs(text, 1)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings,
                         ["(1) test ", "(2) of ", "(3) some "])

        ps = self.regParser.paragraphs(text, 2)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings, [])

    def test_build_paragraph_tree(self):
        """Verify several paragraph trees."""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        self.assertEqual(
            self.regParser.build_tree(text),
            Node("This ", children=[
                Node("(a) is a good ", label=['a'], children=[
                    Node("(1) test ", label=['a', '1']),
                    Node("(2) of ", label=['a', '2']),
                    Node("(3) some ", label=['a', '3'])
                ]),
                Node("(b) body.", label=['b'])
            ])
        )

    def test_build_tree_exclude(self):
        """Paragraph tree should not split on exclude areas."""
        ref = "Ref (b)(2)"
        text = "This (a) is a good (1) {0} test (2) no?".format(ref)
        ref_pos = text.find(ref)
        self.assertEqual(
            self.regParser.build_tree(text,
                                      exclude=[(ref_pos, ref_pos+len(ref))]),
            Node("This ", children=[
                Node("(a) is a good ", label=['a'], children=[
                    Node("(1) {0} test ".format(ref), label=['a', '1']),
                    Node("(2) no?", label=['a', '2'])
                ])
            ])
        )

    def test_build_tree_label_preamble(self):
        """Paragraph tree's labels can be prepended."""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        tree = self.regParser.build_tree(text, label=['205', '14'])
        self.assertEqual(["205", "14"], tree.label)
        child_a, child_b = tree.children
        self.assertEqual(["205", "14", "a"], child_a.label)
        child_a_1, child_a_2, child_a_3 = child_a.children
        self.assertEqual(["205", "14", "a", "1"], child_a_1.label)
        self.assertEqual(["205", "14", "a", "2"], child_a_2.label)
        self.assertEqual(["205", "14", "a", "3"], child_a_3.label)
        self.assertEqual(["205", "14", "b"], child_b.label)

    def test_hash_for_paragraph(self):
        """hash_for_paragraph should standardize the given parameter. It
        should also use numbers in a large range -- an arbitrary hash should
        result in a relatively large number"""
        self.assertEqual(hash_for_paragraph('Abc 123 More.'),
                         hash_for_paragraph(' abc123 mOrE'))
        random_term = uuid.uuid4().hex
        self.assertTrue(hash_for_paragraph(random_term) > 10000,
                        msg="Hashed too small: {0}".format(random_term))
class DepthParagraphTest(TestCase):
    def setUp(self):
        """Use a parser like that used for reg text."""
        self.regParser = ParagraphParser(r"\(%s\)", Node.REGTEXT)

    def test_matching_subparagraph_ids(self):
        matches = self.regParser.matching_subparagraph_ids(0, 8)    # 'i'
        self.assertEqual(1, len(matches))
        self.assertEqual(2, matches[0][0])
        self.assertEqual(0, matches[0][1])
        matches = self.regParser.matching_subparagraph_ids(1, 3)    # '4'
        self.assertEqual(0, len(matches))

    def test_best_start(self):
        text = "This is my (ii) awesome text with a subparagraph in it."
        begin_match = (0, 0)
        end_match = (len(text), len(text))
        starts = [begin_match, end_match]
        self.assertEqual(end_match,
                         self.regParser.best_start(text, 0, 8, starts))
        self.assertEqual(begin_match,
                         self.regParser.best_start(text, 0, 9, starts))

    def test_find_paragraph_start_match_success(self):
        """Simple label checks."""
        text = "This (a) is (Z) the first (1) section for (2) something\n"
        text += "and then (iii) another thing goes here."
        self.assertEqual((5, 8),
                         self.regParser.find_paragraph_start_match(text, 0, 0))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 0, 1))
        self.assertEqual((26, 29),
                         self.regParser.find_paragraph_start_match(text, 1, 0))
        self.assertEqual((42, 45),
                         self.regParser.find_paragraph_start_match(text, 1, 1))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 1, 2))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 0))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 1))
        self.assertEqual((65, 70),
                         self.regParser.find_paragraph_start_match(text, 2, 2))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 2, 3))
        self.assertEqual(None,
                         self.regParser.find_paragraph_start_match(text, 3, 0))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(text, 3, 25))
        self.assertEqual(
            None, self.regParser.find_paragraph_start_match(text, 3, 26))

    def test_find_paragraph_start_match_excludes(self):
        """Excluded ranges should not be included in results."""
        text = "This (a) is (a) a test (a) section for (a) testing."
        self.assertEqual((5, 8),
                         self.regParser.find_paragraph_start_match(text, 0, 0))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(text, 0, 0, []))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(10, len(text))]))
        self.assertEqual(
            (5, 8), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 1)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 10)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, 1), (4, 9)]))
        self.assertEqual(
            (12, 15), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(5, 5)]))
        self.assertEqual(
            (39, 42), self.regParser.find_paragraph_start_match(
                text, 0, 0, [(5, 7), (10, 25)]))
        self.assertEqual(
            None, self.regParser.find_paragraph_start_match(
                text, 0, 0, [(0, len(text))]))

    def test_find_paragraph_start_match_is(self):
        """Test the case where we are looking for paragraph (i) (the
        letter,) but we run into (i) (the roman numeral.)"""
        text1 = "(h) Paragraph (1) H has (i) some (ii) sub (iii) sections but "
        text2 = "(i) this paragraph does not."
        self.assertEqual(
            (len(text1), len(text1)+3),
            self.regParser.find_paragraph_start_match(text1+text2, 0, 8))

    def test_paragraph_offsets_present(self):
        """Test that section_offsets works as expected for good input."""
        text = "This (a) is a good (b) test for (c) something like this."""
        self.assertEqual((5, 19), self.regParser.paragraph_offsets(text, 0, 0))
        self.assertEqual((19, 32),
                         self.regParser.paragraph_offsets(text, 0, 1))
        self.assertEqual((32, len(text)),
                         self.regParser.paragraph_offsets(text, 0, 2))

    def test_paragraph_offsets_not_present(self):
        """Verify we get None when the searched for text isn't there."""
        text = "This (a) is a good (b) test for (c) something like this."""
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 0, 3))
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 1, 0))
        self.assertEqual(None, self.regParser.paragraph_offsets(text, 2, 0))

    def test_paragraphs(self):
        """This method should pull out the relevant paragraphs, as a list"""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        ps = self.regParser.paragraphs(text, 0)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings,
                         ["(a) is a good (1) test (2) of (3) some ",
                          "(b) body."])

        text = "(a) is a good (1) test (2) of (3) some "
        ps = self.regParser.paragraphs(text, 1)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings,
                         ["(1) test ", "(2) of ", "(3) some "])

        ps = self.regParser.paragraphs(text, 2)
        paragraph_strings = [text[s[0]:s[1]] for s in ps]
        self.assertEqual(paragraph_strings, [])

    def test_build_paragraph_tree(self):
        """Verify several paragraph trees."""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        self.assertEqual(
            self.regParser.build_tree(text),
            Node("This ", children=[
                Node("(a) is a good ", label=['a'], children=[
                    Node("(1) test ", label=['a', '1']),
                    Node("(2) of ", label=['a', '2']),
                    Node("(3) some ", label=['a', '3'])
                ]),
                Node("(b) body.", label=['b'])
            ])
        )

    def test_build_tree_exclude(self):
        """Paragraph tree should not split on exclude areas."""
        ref = "Ref (b)(2)"
        text = "This (a) is a good (1) %s test (2) no?" % ref
        ref_pos = text.find(ref)
        self.assertEqual(
            self.regParser.build_tree(text,
                                      exclude=[(ref_pos, ref_pos+len(ref))]),
            Node("This ", children=[
                Node("(a) is a good ", label=['a'], children=[
                    Node("(1) %s test " % ref, label=['a', '1']),
                    Node("(2) no?", label=['a', '2'])
                ])
            ])
        )

    def test_build_tree_label_preamble(self):
        """Paragraph tree's labels can be prepended."""
        text = "This (a) is a good (1) test (2) of (3) some (b) body."
        tree = self.regParser.build_tree(text, label=['205', '14'])
        self.assertEqual(["205", "14"], tree.label)
        child_a, child_b = tree.children
        self.assertEqual(["205", "14", "a"], child_a.label)
        child_a_1, child_a_2, child_a_3 = child_a.children
        self.assertEqual(["205", "14", "a", "1"], child_a_1.label)
        self.assertEqual(["205", "14", "a", "2"], child_a_2.label)
        self.assertEqual(["205", "14", "a", "3"], child_a_3.label)
        self.assertEqual(["205", "14", "b"], child_b.label)

    def test_hash_for_paragraph(self):
        """hash_for_paragraph should standardize the given parameter. It
        should also use numbers in a large range -- an arbitrary hash should
        result in a relatively large number"""
        self.assertEqual(hash_for_paragraph('Abc 123 More.'),
                         hash_for_paragraph(' abc123 mOrE'))
        random_term = uuid.uuid4().hex
        self.assertTrue(hash_for_paragraph(random_term) > 10000,
                        msg="Hashed too small: {}".format(random_term))
Esempio n. 4
0
        for start, end in subpart_locations:
            subpart_body = body[start:end]
            subpart, _ = build_subparts_tree(
                subpart_body, part, lambda p: build_subpart(subpart_body, p))
            subparts_list.append(subpart)
    else:
        emptypart, children_text = build_subparts_tree(body, part,
                                                       build_empty_part)
        if emptypart.children:
            subparts_list.append(emptypart)
        else:
            return struct.Node(text, [build_empty_part(part)], label, title)
    return struct.Node(children_text, subparts_list, label, title)


regParser = ParagraphParser(r"\(%s\)", struct.Node.REGTEXT)


def build_empty_part(part):
    """ When a regulation doesn't have a subpart, we give it an emptypart (a
    dummy subpart) so that the regulation tree is consistent. """

    label = [str(part), 'Subpart']
    return struct.Node('', [], label, '', node_type=struct.Node.EMPTYPART)


def build_subpart(text, part):
    results = marker_subpart_title.parseString(text)
    subpart_letter = results.subpart
    subpart_title = results.subpart_title
    label = [str(part), 'Subpart', subpart_letter]