Python compile_patterns Beispiele, graphify.descriptor.utils.compile_patterns Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_build_graph.py Projekt: raufer/graphify

    def test_handle_match_next_in_the_hierarchy_requiring_padding(self):
        """
        A match whose level does not immediately follow the current one must be
        accommodated on the graph; with 'padding' set to True the skipped levels
        are expected to be filled with padding nodes.

        Here a level-3 match is inserted on a fresh backbone, so padding nodes
        for levels 1 and 2 ('A' and 'B') are expected before the match itself.
        """
        hit = re.search(r'Test Match', 'This is a Test Match sentence')

        compiled = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': True
        })
        backbone = initialize_backbone(NetworkxImplementation())

        new_key = handle_match(backbone, hit, 3, compiled)
        self.assertEqual(new_key, "Test Match [3]")

        expected_keys = sorted(["ROOT [0]", "Test Match [3]", "A [1]", "B [2]"])
        self.assertListEqual(sorted(backbone.nodes()), expected_keys)

        # Padding nodes carry pad=1; the real match carries pad=0.
        expected_attrs = (
            ("A [1]", {'pad': 1, 'meta': 'A', 'level': 1, 'content': []}),
            ("B [2]", {'pad': 1, 'meta': 'B', 'level': 2, 'content': []}),
            ("Test Match [3]", {'pad': 0, 'meta': 'Test Match', 'level': 3, 'content': []}),
        )
        for key, attrs in expected_attrs:
            self.assertDictEqual(backbone[key], attrs)

Beispiel #2

0

Datei anzeigen

Datei: test_build_graph.py Projekt: raufer/graphify

    def test_handle_match_higher_on_the_hierarchy(self):
        """
        A match whose level precedes the current one must be attached to the
        appropriate parent.

        Matches are inserted at levels 1, 2 and then 1 again; the third node is
        expected to become a child of the root, next to the first one.
        """
        hit = re.search(r'Test Match', 'This is a Test Match sentence')

        compiled = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': False
        })
        backbone = initialize_backbone(NetworkxImplementation())

        # Descend to level 2 first so that the final level-1 match goes back up.
        for level in (1, 2):
            handle_match(backbone, hit, level, compiled)
        new_key = handle_match(backbone, hit, 1, compiled)

        self.assertEqual(new_key, "Test Match [3]")

        expected_keys = sorted(["ROOT [0]", "Test Match [1]", "Test Match [2]", "Test Match [3]"])
        self.assertListEqual(sorted(backbone.nodes()), expected_keys)
        self.assertDictEqual(
            backbone["Test Match [3]"],
            {'pad': 0, 'meta': 'Test Match', 'level': 1, 'content': []},
        )

        root_children = sorted(target for _source, target in backbone.edges("ROOT [0]"))
        self.assertEqual(root_children, sorted(['Test Match [3]', 'Test Match [1]']))

Beispiel #3

0

Datei anzeigen

Datei: test_parsing.py Projekt: raufer/graphify

    def test_post_build_processing_remove_occurrences(self):
        """
        After parsing and post-build processing, the 'text' of each traversed
        node is expected to no longer carry the ``[[Chapter]]`` / ``[[Article]]``
        markers that were present in the input lines.
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]
        hierarchy = {
            'components': ['Chapter', 'Article'],
            'patterns': ['Chapter', 'Article']
        }

        parsed = parse_iterable(lines, hierarchy)

        # Prepare the descriptor the same way before running the post-processing again.
        prepared = compile_patterns(extend_internal_patterns(hierarchy))
        processed = post_build_process(parsed, prepared)

        texts = []
        for _key, node in processed.traverse():
            texts.append(node['text'])

        self.assertListEqual(texts, [
            [],
            ["Chapter I", "This is chapter I text"],
            ["Article I", "This is article I text"],
        ])

Beispiel #4

0

Datei anzeigen

    def test_search_descriptor_patterns_different_components_same_hierarchy(self):
        """
        Components grouped in the same list ('A' and 'Z') are expected to resolve
        to the same level; an unmatched input is expected to yield None.
        """
        prepared = normalize_descriptor(compile_patterns({
            'components': [['A', 'Z'], 'B', 'C'],
            'patterns': [[r'A', r'Z'], r'B', r'C']
        }))

        # (input text, expected value at index 1 of the search result)
        cases = (('A', 1), ('Z', 1), ('B', 2), ('C', 3), ('X', None))
        for text, expected_level in cases:
            self.assertEqual(search_descriptor_patterns(text, prepared)[1], expected_level)

Beispiel #5

0

Datei anzeigen

Datei: test_descriptor_utils.py Projekt: raufer/graphify

    def test_pattern_compiling_with_flags(self):
        """
        Patterns may already be compiled regex objects when the user needs more
        flexibility; compiling such a descriptor is expected to yield a dict
        equal to the input.
        """
        precompiled = [re.compile(letter) for letter in (r'A', r'B', r'C')]
        descriptor = {
            'components': ['A', 'B', 'C'],
            'patterns': precompiled
        }

        self.assertDictEqual(descriptor, compile_patterns(descriptor))

Beispiel #6

0

Datei anzeigen

    def test_search_descriptor_patterns(self):
        """
        Searching a compiled and normalized descriptor is expected to report
        level 1, 2 and 3 for 'A', 'B' and 'C', and None for an unmatched input.
        """
        prepared = normalize_descriptor(compile_patterns({
            'components': ['A', 'B', 'C'],
            'patterns': [r'A', r'B', r'C']
        }))

        # (input text, expected value at index 1 of the search result)
        cases = (('A', 1), ('B', 2), ('C', 3), ('X', None))
        for text, expected_level in cases:
            self.assertEqual(search_descriptor_patterns(text, prepared)[1], expected_level)

Beispiel #7

0

Datei anzeigen

Datei: test_descriptor_utils.py Projekt: raufer/graphify

    def test_pattern_compiling(self):
        """
        Given a descriptor configuration object, every 'pattern' should be
        compiled into a regex pattern object while 'components' is left as is.
        """
        import re  # local import keeps this test self-contained

        descriptor = {
            'components': ['A', 'B', 'C'],
            'patterns': [r'A', r'B', r'C']
        }

        compiled_descriptor = compile_patterns(descriptor)

        self.assertListEqual(compiled_descriptor['components'],
                             ['A', 'B', 'C'])

        # Check against the actual compiled-pattern type instead of its string
        # representation, whose text is not the same on every Python version.
        pattern_type = type(re.compile(''))
        for pattern in compiled_descriptor['patterns']:
            self.assertIsInstance(pattern, pattern_type)

Beispiel #8

0

Datei anzeigen

Datei: __init__.py Projekt: raufer/graphify

def parse_iterable(it: Iterable[str],
                   descriptor: Dict,
                   name: str = 'ROOT') -> Document:
    """
    Parse an iterable into a graph-backed document, following a descriptor
    that describes the iterable's hierarchical structure.

    'name' is the document name without spaces; it is used to build the key
    of the document's root node ("<name> [0]").
    """
    # Each preparation step receives the descriptor returned by the previous one.
    preparation_steps = (
        extend_internal_patterns,
        extend_descriptor_with_data_capture_group,
        compile_patterns,
    )
    for step in preparation_steps:
        descriptor = step(descriptor)

    graph = build(it, descriptor, name)
    return post_build_process(Document(graph, "{} [0]".format(name)), descriptor)

Beispiel #9

0

Datei anzeigen

Datei: test_content_traverse.py Projekt: raufer/graphify

    def test_stop_marker(self):
        """
        A 'stopParsing' pattern can be supplied in the descriptor.

        With the stop pattern ``^Appoint`` the built graph is expected to contain
        no nodes past "3B.1 [7]".
        """
        compiled = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s'],
            'stopParsing': r'^Appoint'
        })
        graph = build(self.text, compiled)

        # The bracketed number in each node key serves as the sort key.
        index_pattern = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def reading_position(key):
            return int(index_pattern.search(key).group(1))

        ordered_keys = sorted(graph.nodes(), key=reading_position)
        self.assertListEqual(ordered_keys, [
            "ROOT [0]", "1. [1]", "1.1 [2]", "2. [3]", "2A. [4]", "2A.1 [5]",
            "3B [6]", "3B.1 [7]"
        ])

        expected_attrs = (
            ("2. [3]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0
            }),
            ("3B [6]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION']
            }),
        )
        for key, attrs in expected_attrs:
            self.assertDictEqual(graph.node[key], attrs)

Beispiel #10

0

Datei anzeigen

Datei: test_build_graph.py Projekt: raufer/graphify

    def test_handle_match_next_in_the_hierarchy(self):
        """
        Simplest case of accommodating a new match on the graph: the matched
        node follows next on the hierarchy (level 1 on a fresh backbone), so
        only the match itself is expected to be added.
        """
        hit = re.search(r'Test Match', 'This is a Test Match sentence')

        compiled = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D'],
            'padding': True
        })
        backbone = initialize_backbone(NetworkxImplementation())

        new_key = handle_match(backbone, hit, 1, compiled)

        self.assertEqual(new_key, "Test Match [1]")

        expected_keys = sorted(["ROOT [0]", "Test Match [1]"])
        self.assertListEqual(sorted(backbone.nodes()), expected_keys)
        self.assertDictEqual(
            backbone["Test Match [1]"],
            {'pad': 0, 'meta': 'Test Match', 'level': 1, 'content': []},
        )

Beispiel #11

0

Datei anzeigen

Datei: test_build_graph.py Projekt: raufer/graphify

    def test_padding_01(self):
        """
        When requested, inserting a new node is preceded by a padding step that
        keeps the hierarchy uniform and predictable.

        Starting from a level-1 node, padding from level 2 with 4 as the upper
        argument is expected to add 'B [2]' and 'C [3]' and return 'C [3]'.
        """
        compiled = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D']
        })
        backbone = initialize_backbone(NetworkxImplementation())

        level = 1
        anchor = _add_node(backbone, 'NEW NODE', "ROOT [0]", level=level, meta='ART')

        tail = _pad(backbone, anchor, level + 1, 4, compiled)

        expected_keys = sorted(['ROOT [0]', 'NEW NODE [1]', 'B [2]', 'C [3]'])
        self.assertListEqual(sorted(backbone.nodes()), expected_keys)
        self.assertEqual(tail, 'C [3]')

Beispiel #12

0

Datei anzeigen

Datei: test_build_graph.py Projekt: raufer/graphify

    def test_padding_no_effect(self):
        """
        If the requested level is just one below the hierarchy, the padding
        process should leave the graph unchanged and hand back the node it was
        given.
        """
        compiled = compile_patterns({
            'components': ['A', 'B', 'C', 'D'],
            'patterns': [r'A', r'B', r'C', r'D']
        })
        backbone = initialize_backbone(NetworkxImplementation())

        anchor = _add_node(backbone, 'NEW NODE', "ROOT [0]", level=1, meta='ART')

        # Snapshot the node set on both sides of the padding call.
        keys_before = sorted(backbone.nodes())
        tail = _pad(backbone, anchor, 3 + 1, 4, compiled)
        keys_after = sorted(backbone.nodes())

        self.assertEqual(tail, anchor)
        self.assertListEqual(keys_before, keys_after)

Beispiel #13

0

Datei anzeigen

Datei: test_content_traverse.py Projekt: raufer/graphify

    def test_graph_is_correctly_build(self):
        """
        Given iterable content plus a hierarchy descriptor, a graph capturing
        the content structure can be built.

        Checks the reading order of all node keys and the attributes of three
        selected nodes.
        """
        compiled = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s']
        })
        graph = build(self.text, compiled)

        # The bracketed number in each node key serves as the sort key.
        index_pattern = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def reading_position(key):
            return int(index_pattern.search(key).group(1))

        ordered_keys = sorted(graph.nodes(), key=reading_position)
        self.assertListEqual(ordered_keys, [
            "ROOT [0]",
            "1. [1]",
            "1.1 [2]",
            "2. [3]",
            "2A. [4]",
            "2A.1 [5]",
            "3B [6]",
            "3B.1 [7]",
            "4. [8]",
            "4.1 [9]",
            "4.2 [10]",
            "4.4 [11]",
        ])

        expected_attrs = (
            ("2. [3]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0
            }),
            ("3B [6]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION']
            }),
            # The '4.3.' line is expected as content of 4.2, not as its own node.
            ("4.2 [10]", {
                'level': 2,
                'pad': 0,
                'content': [
                    '4.2 Without prejudice to Clause 3.3, the Manager shall ensure that prompt notice of all dealings',
                    '4.3. Subject to PRA and/or FCA Rules, the Manager may aggregate transactions'
                ],
                'meta': '4.2'
            }),
        )
        for key, attrs in expected_attrs:
            self.assertDictEqual(graph.node[key], attrs)

Beispiel #14

0

Datei anzeigen

Datei: test_content_traverse.py Projekt: raufer/graphify

    def test_start_parsing(self):
        """
        A 'startParsing' pattern can be supplied to point out where the relevant
        content begins.

        With the start pattern used here the first expected node after the root
        is "2. [1]".
        """
        compiled = compile_patterns({
            'components': ['Section', 'Subsection'],
            'patterns': [r'^\d{1,2}[A-Z]?\.?\s', r'^\d{1,2}[A-Z]?\.\d{1,2}\s'],
            'startParsing': r'the provisions of any'
        })
        graph = build(self.text, compiled)

        # The bracketed number in each node key serves as the sort key.
        index_pattern = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def reading_position(key):
            return int(index_pattern.search(key).group(1))

        ordered_keys = sorted(graph.nodes(), key=reading_position)
        self.assertListEqual(ordered_keys, [
            "ROOT [0]",
            "2. [1]",
            "2A. [2]",
            "2A.1 [3]",
            "3B [4]",
            "3B.1 [5]",
            "4. [6]",
            "4.1 [7]",
            "4.2 [8]",
            "4.4 [9]",
        ])

        expected_attrs = (
            ("2. [1]", {
                'content': ['2. MANAGEMENT'],
                'level': 1,
                'meta': '2.',
                'pad': 0
            }),
            ("3B [4]", {
                'meta': '3B',
                'level': 1,
                'pad': 0,
                'content': ['3B CUSTODY AND REGISTRATION']
            }),
            # The '4.3.' line is expected as content of 4.2, not as its own node.
            ("4.2 [8]", {
                'level': 2,
                'pad': 0,
                'content': [
                    '4.2 Without prejudice to Clause 3.3, the Manager shall ensure that prompt notice of all dealings',
                    '4.3. Subject to PRA and/or FCA Rules, the Manager may aggregate transactions'
                ],
                'meta': '4.2'
            }),
        )
        for key, attrs in expected_attrs:
            self.assertDictEqual(graph.node[key], attrs)

Beispiel #15

0

Datei anzeigen

Datei: __init__.py Projekt: raufer/graphify

 def __init__(self, descriptor):
     """
     Store the descriptor after extending it with the internal patterns and
     compiling its patterns.

     The compile step is fed the extended descriptor, not the raw argument,
     so that the extension is not discarded by the second assignment.
     """
     extended = extend_internal_patterns(descriptor)
     self.descriptor = compile_patterns(extended)