Beispiel #1
0
    def test_handle_match_next_in_the_hierarchy_requiring_padding(self):
        """
        A match that lands more than one level below the current node, with
        'padding' enabled in the descriptor, must be inserted together with
        padding nodes for the levels that were skipped.
        """
        descriptor = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': True
        })
        graph = initialize_backbone(NetworkxImplementation())
        match = re.search(r'Test Match', 'This is a Test Match sentence')

        # Jump straight from the root (level 0) to level 3.
        inserted = handle_match(graph, match, 3, descriptor)

        self.assertEqual(inserted, "Test Match [3]")

        expected_keys = ["ROOT [0]", "Test Match [3]", "A [1]", "B [2]"]
        self.assertListEqual(sorted(graph.nodes()), sorted(expected_keys))

        # Levels 1 and 2 are padding nodes ('pad': 1); the match itself is not.
        expected_attributes = (
            ("A [1]", {'pad': 1, 'meta': 'A', 'level': 1, 'content': []}),
            ("B [2]", {'pad': 1, 'meta': 'B', 'level': 2, 'content': []}),
            ("Test Match [3]", {'pad': 0, 'meta': 'Test Match', 'level': 3, 'content': []}),
        )
        for key, attributes in expected_attributes:
            self.assertDictEqual(graph[key], attributes)
Beispiel #2
0
    def test_handle_match_higher_on_the_hierarchy(self):
        """
        A match whose level precedes the current one must be attached to the
        appropriate parent: after going down to level 2, a new level 1 match
        becomes another child of the root.
        """
        descriptor = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': False
        })
        graph = initialize_backbone(NetworkxImplementation())
        match = re.search(r'Test Match', 'This is a Test Match sentence')

        # Walk down the hierarchy first (levels 1 then 2) ...
        for level in (1, 2):
            handle_match(graph, match, level, descriptor)

        # ... then come back up to level 1.
        inserted = handle_match(graph, match, 1, descriptor)

        self.assertEqual(inserted, "Test Match [3]")

        expected_keys = ["ROOT [0]", "Test Match [1]", "Test Match [2]", "Test Match [3]"]
        self.assertListEqual(sorted(graph.nodes()), sorted(expected_keys))
        self.assertDictEqual(graph["Test Match [3]"], {'pad': 0, 'meta': 'Test Match', 'level': 1, 'content': []})

        # Both level 1 nodes must hang directly from the root.
        root_children = [target for _source, target in graph.edges("ROOT [0]")]
        self.assertEqual(sorted(root_children), sorted(['Test Match [3]', 'Test Match [1]']))
Beispiel #3
0
    def test_post_build_processing_remove_occurrences(self):
        """
        After parsing and post-build processing, the text stored on each node
        must no longer carry the '[[Component]]' markers present in the input.
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]
        descriptor = {
            'components': ['Chapter', 'Article'],
            'patterns': ['Chapter', 'Article']
        }

        doc = parse_iterable(lines, descriptor)

        # Prepare the descriptor the same way before running the post-build step.
        prepared = compile_patterns(extend_internal_patterns(descriptor))
        doc = post_build_process(doc, prepared)

        texts = []
        for _key, node in doc.traverse():
            texts.append(node['text'])

        self.assertListEqual(texts, [
            [],
            ["Chapter I", "This is chapter I text"],
            ["Article I", "This is article I text"],
        ])
Beispiel #4
0
    def test_search_descriptor_patterns_different_components_same_hierarchy(self):
        """
        Components grouped in a nested list ('A' and 'Z' here) must resolve to
        the same hierarchy level; text matching no pattern resolves to None.
        """
        descriptor = normalize_descriptor(compile_patterns({
            'components': [['A', 'Z'], 'B', 'C'],
            'patterns': [[r'A', r'Z'], r'B', r'C']
        }))

        # (searched text, expected second element of the search result)
        expectations = (
            ('A', 1),
            ('Z', 1),
            ('B', 2),
            ('C', 3),
            ('X', None),
        )
        for text, level in expectations:
            self.assertEqual(search_descriptor_patterns(text, descriptor)[1], level)
    def test_pattern_compiling_with_flags(self):
        """
        Patterns may be supplied as already-compiled regex objects (for users
        who need more flexibility); the descriptor returned by
        compile_patterns must then compare equal to the one passed in.
        """
        precompiled = [re.compile(source) for source in (r'A', r'B', r'C')]
        descriptor = {
            'components': ['A', 'B', 'C'],
            'patterns': precompiled
        }

        self.assertDictEqual(descriptor, compile_patterns(descriptor))
Beispiel #6
0
    def test_search_descriptor_patterns(self):
        """
        Searching a text against a compiled, normalized descriptor must report
        the 1-based position of the matching pattern as the second element of
        the result, or None when no pattern matches.
        """
        descriptor = normalize_descriptor(compile_patterns({
            'components': ['A', 'B', 'C'],
            'patterns': [r'A', r'B', r'C']
        }))

        # (searched text, expected second element of the search result)
        expectations = (
            ('A', 1),
            ('B', 2),
            ('C', 3),
            ('X', None),
        )
        for text, level in expectations:
            self.assertEqual(search_descriptor_patterns(text, descriptor)[1], level)
    def test_pattern_compiling(self):
        """
        Given a descriptor configuration object, every entry of 'patterns'
        must come back from compile_patterns as a compiled regular expression,
        while 'components' is left as provided.
        """
        descriptor = {
            'components': ['A', 'B', 'C'],
            'patterns': [r'A', r'B', r'C']
        }

        compiled_descriptor = compile_patterns(descriptor)

        self.assertListEqual(compiled_descriptor['components'],
                             ['A', 'B', 'C'])

        # Check against the actual compiled-pattern type instead of its repr:
        # the textual class name differs between Python versions, whereas the
        # type of a freshly compiled pattern is always the right reference.
        pattern_type = type(re.compile(''))
        for pattern in compiled_descriptor['patterns']:
            self.assertIsInstance(pattern, pattern_type)
Beispiel #8
0
def parse_iterable(it: Iterable[str],
                   descriptor: Dict,
                   name: str = 'ROOT') -> Document:
    """
    Parse an iterable of strings into a Document, following the hierarchical
    structure described by 'descriptor'.

    The descriptor is first extended with the internal patterns and the data
    capture group, then compiled; the same prepared descriptor drives both
    the graph build and the post-build processing.

    'name' is the document name without spaces; the Document is created with
    "<name> [0]" as its second argument.
    """
    prepared = compile_patterns(
        extend_descriptor_with_data_capture_group(
            extend_internal_patterns(descriptor)))

    graph = build(it, prepared, name)
    return post_build_process(Document(graph, "{} [0]".format(name)), prepared)
    def test_stop_marker(self):
        """
        A 'stopParsing' pattern can be supplied in the descriptor; the graph
        built from the fixture text must then contain only the nodes listed
        below.
        """
        descriptor = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s'],
            'stopParsing': r'^Appoint'
        })

        graph = build(self.text, descriptor)

        # Node keys end with a bracketed number; sort the keys by that number.
        bracketed_number = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(key):
            return int(bracketed_number.search(key).group(1))

        reading_order = sorted(graph.nodes(), key=identifier)

        self.assertListEqual(reading_order, [
            "ROOT [0]",
            "1. [1]",
            "1.1 [2]",
            "2. [3]",
            "2A. [4]",
            "2A.1 [5]",
            "3B [6]",
            "3B.1 [7]",
        ])

        expected_attributes = (
            ("2. [3]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0,
            }),
            ("3B [6]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION'],
            }),
        )
        for key, attributes in expected_attributes:
            self.assertDictEqual(graph.node[key], attributes)
Beispiel #10
0
    def test_handle_match_next_in_the_hierarchy(self):
        """
        Simplest case of accommodating a new match on the graph: the matched
        level is the one that follows next on the hierarchy, so a single node
        is inserted and no padding node is created.
        """
        descriptor = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': True
        })
        graph = initialize_backbone(NetworkxImplementation())
        match = re.search(r'Test Match', 'This is a Test Match sentence')

        inserted = handle_match(graph, match, 1, descriptor)

        self.assertEqual(inserted, "Test Match [1]")

        expected_keys = ["ROOT [0]", "Test Match [1]"]
        self.assertListEqual(sorted(graph.nodes()), sorted(expected_keys))

        expected_attributes = {'pad': 0, 'meta': 'Test Match', 'level': 1, 'content': []}
        self.assertDictEqual(graph["Test Match [1]"], expected_attributes)
Beispiel #11
0
    def test_padding_01(self):
        """
        Padding from level 2 up to (but not including) level 4 below a level 1
        node must create one node per skipped level, named after the matching
        descriptor component, and return the last node created.
        """
        descriptor = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D']
        })
        graph = initialize_backbone(NetworkxImplementation())

        node_level = 1
        node = _add_node(graph, 'NEW NODE', "ROOT [0]", level=node_level, meta='ART')

        last_node = _pad(graph, node, node_level + 1, 4, descriptor)

        expected_keys = ['ROOT [0]', 'NEW NODE [1]', 'B [2]', 'C [3]']
        self.assertListEqual(sorted(graph.nodes()), sorted(expected_keys))

        self.assertEqual(last_node, 'C [3]')
Beispiel #12
0
    def test_padding_no_effect(self):
        """
        When the two level arguments given to _pad are equal (4 and 4 here,
        presumably the first level to pad and the target level), the graph
        must be left unchanged and the node passed in must be returned.
        """
        descriptor = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D']
        })
        graph = initialize_backbone(NetworkxImplementation())

        node = _add_node(graph, 'NEW NODE', "ROOT [0]", level=1, meta='ART')

        # Snapshot the node keys around the call to detect any insertion.
        nodes_before = sorted(graph.nodes())
        last_node = _pad(graph, node, 4, 4, descriptor)
        nodes_after = sorted(graph.nodes())

        self.assertEqual(last_node, node)
        self.assertListEqual(nodes_before, nodes_after)
    def test_graph_is_correctly_build(self):
        """
        Building a graph from the fixture text with a Section/Subsection
        descriptor must yield the expected nodes in reading order, each
        carrying the expected level, meta, pad flag and content lines.
        """
        descriptor = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s']
        })

        graph = build(self.text, descriptor)

        # Node keys end with a bracketed number; sort the keys by that number.
        bracketed_number = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(key):
            return int(bracketed_number.search(key).group(1))

        reading_order = sorted(graph.nodes(), key=identifier)

        self.assertListEqual(reading_order, [
            "ROOT [0]",
            "1. [1]",
            "1.1 [2]",
            "2. [3]",
            "2A. [4]",
            "2A.1 [5]",
            "3B [6]",
            "3B.1 [7]",
            "4. [8]",
            "4.1 [9]",
            "4.2 [10]",
            "4.4 [11]",
        ])

        expected_attributes = (
            ("2. [3]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0,
            }),
            ("3B [6]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION'],
            }),
            # The '4.3.' line is expected inside the content of node 4.2
            # rather than as a node of its own.
            ("4.2 [10]", {
                'level': 2,
                'pad': 0,
                'content': [
                    '4.2 Without prejudice to Clause 3.3, the Manager shall ensure that prompt notice of all dealings',
                    '4.3. Subject to PRA and/or FCA Rules, the Manager may aggregate transactions',
                ],
                'meta': '4.2',
            }),
        )
        for key, attributes in expected_attributes:
            self.assertDictEqual(graph.node[key], attributes)
    def test_start_parsing(self):
        """
        A 'startParsing' pattern can be supplied in the descriptor to point
        out where the relevant content begins; with it, the first node after
        the root in the fixture text is expected to be '2. [1]'.
        """
        descriptor = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s'],
            'startParsing': r'the provisions of any'
        })

        graph = build(self.text, descriptor)

        # Node keys end with a bracketed number; sort the keys by that number.
        bracketed_number = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(key):
            return int(bracketed_number.search(key).group(1))

        reading_order = sorted(graph.nodes(), key=identifier)

        self.assertListEqual(reading_order, [
            "ROOT [0]",
            "2. [1]",
            "2A. [2]",
            "2A.1 [3]",
            "3B [4]",
            "3B.1 [5]",
            "4. [6]",
            "4.1 [7]",
            "4.2 [8]",
            "4.4 [9]",
        ])

        expected_attributes = (
            ("2. [1]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0,
            }),
            ("3B [4]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION'],
            }),
            # The '4.3.' line is expected inside the content of node 4.2
            # rather than as a node of its own.
            ("4.2 [8]", {
                'level': 2,
                'pad': 0,
                'content': [
                    '4.2 Without prejudice to Clause 3.3, the Manager shall ensure that prompt notice of all dealings',
                    '4.3. Subject to PRA and/or FCA Rules, the Manager may aggregate transactions',
                ],
                'meta': '4.2',
            }),
        )
        for key, attributes in expected_attributes:
            self.assertDictEqual(graph.node[key], attributes)
Beispiel #15
0
 def __init__(self, descriptor):
     """
     Store the descriptor after extending it with the internal patterns
     and compiling the result.

     'descriptor' is the hierarchy descriptor dict (with 'components' and
     'patterns' entries, as accepted by extend_internal_patterns).
     """
     extended = extend_internal_patterns(descriptor)
     # Compile the *extended* descriptor: compiling the raw argument again
     # would discard whatever extend_internal_patterns returned.
     self.descriptor = compile_patterns(extended)