Example #1
0
    def test_map_data(self):
        """
        `map_values` applies a function to every node's data and returns
        a new document; the original document must be left untouched
        (immutability check).
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
            "[[Article]] Article II",
            "This is article II text"
        ]

        descriptor = {
            'components': ['Chapter', 'Article'],
            'patterns': ['Chapter', 'Article']
        }

        doc = parse_iterable(lines, descriptor)

        # Snapshot the levels before mapping so we can prove immutability.
        levels_before = [node['level'] for _, node in doc.traverse()]

        def set_level(data):
            data['level'] = 100

        mapped = map_values(doc, set_level)

        mapped_levels = [node['level'] for _, node in mapped.traverse()]
        self.assertListEqual(mapped_levels, [100] * len(mapped_levels))

        # The source document must not have been mutated.
        levels_after = [node['level'] for _, node in doc.traverse()]
        self.assertListEqual(levels_before, levels_after)
Example #2
0
    def test_hierarchy_jumps(self):
        """
        A document may skip intermediate hierarchy levels (here Chapter
        jumps straight to Article, past Section and Sub-section) and
        still parse without error.
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]

        descriptor = {
            'components': ['Chapter', 'Section', 'Sub-section', 'Article'],
            'patterns': ['Chapter', 'Section', 'Sub-section', 'Article']
        }

        doc = parse_iterable(lines, descriptor)

        # Node labels embed a numeric index, e.g. "Chapter [1]"; sort by it
        # to recover the reading order.
        index_re = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(label):
            return int(index_re.search(label).groups(0)[0])

        reading_order = sorted(doc.graph.nodes(), key=identifier)

        self.assertListEqual(reading_order, [
            "ROOT [0]",
            "Chapter [1]",
            "Article [2]",
        ])
Example #3
0
    def test_post_build_processing_remove_occurrences(self):
        """
        After building the graph, post-build processing should strip the
        component markers (e.g. "[[Chapter]] ") from every node's text.
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]

        descriptor = {
            'components': ['Chapter', 'Article'],
            'patterns': ['Chapter', 'Article']
        }

        doc = parse_iterable(lines, descriptor)

        # Prepare the descriptor for the post-build step: derive the
        # internal patterns, then compile them.
        descriptor = compile_patterns(extend_internal_patterns(descriptor))

        doc = post_build_process(doc, descriptor)

        texts = [node['text'] for _, node in doc.traverse()]
        self.assertListEqual(texts, [
            [],
            ["Chapter I", "This is chapter I text"],
            ["Article I", "This is article I text"],
        ])
    def test_graph_is_correctly_build(self):
        """
        Parsing `self.text` with a regex-based descriptor should yield a
        graph whose nodes, read in index order, mirror the document
        structure.
        """
        descriptor = {
            'components': ['Schedule', 'Part', 'Section', 'Point', 'Subpoint'],
            'patterns': [
                r'^Schedule\s\d{1,2}', r'^PART\s\d{1,2}', r'^\d{1,2}\.\s',
                r'^\d{1,2}\.\d{1,2}\.\s', r'^\d{1,2}\.\d{1,2}\.\d{1,2}\s'
            ]
        }

        doc = parse_iterable(self.text, descriptor)

        # Node labels embed a numeric index, e.g. "1.2. [5]"; sort by it
        # to recover the reading order.
        index_re = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(label):
            return int(index_re.search(label).groups(0)[0])

        reading_order = sorted(doc.graph.nodes(), key=identifier)

        expected = [
            "ROOT [0]", "Schedule 1 [1]", "PART 1 [2]", "1. [3]", "1.1. [4]",
            "1.2. [5]", "1.2.1 [6]", "1.2.2 [7]", "1.3. [8]", "2. [9]",
            "2.1. [10]", "2.1.1 [11]", "2.1.2 [12]"
        ]
        self.assertListEqual(reading_order, expected)
Example #5
0
    def test_parse_iterable(self):
        """
        A document without explicit markers should be segmented purely by
        the regex patterns in the descriptor, producing one node per
        matched heading.
        """
        lines = [
            "Schedule 1 - First Part",
            " -- 100",
            "PPL – FInal",
            "PART 1- INVESTMENT RESTRICTIONS",
            "The Manager must pay due regard to Applicable Laws and Regulations",
            "Derivatives will be used only in a manner consistent with the usage restrictions",
            "1. General Restrictions",
            "1.1. The basic restrictions applicable to the Assets specified in the PRA",
            "1.2. The Client has determined that such basic restrictions should be supplemented by:",
            "1.2.1 the general terms contained within this Agreement; and",
            "1.2.2 the terms agreed in previous regulations",
            "1.3. There shall be no negative currency exposures specified in paragraph 2.2.1",
            "2. Derivative Restrictions",
            "2.1. Specific restrictions",
            "2.1.1 Permitted exchanges and contracts",
            "Instruments must be listed(1) or with an approved counterparty(1)",
            "2.1.2 Counterparty Restrictions",
            "Forward Currency Contracts are restricted to those transacted with banks",
        ]

        descriptor = {
            'components': ['Schedule', 'Part', 'Section', 'Point', 'Subpoint'],
            'patterns': [
                r'^Schedule\s\d{1,2}', r'^PART\s\d{1,2}', r'^\d{1,2}\.\s',
                r'^\d{1,2}\.\d{1,2}\.\s', r'^\d{1,2}\.\d{1,2}\.\d{1,2}\s'
            ]
        }

        doc = parse_iterable(lines, descriptor)

        # Node labels embed a numeric index, e.g. "1.2. [5]"; sort by it
        # to recover the reading order.
        index_re = re.compile(r'\[(\d+\_?(\d+)?)[a-z]?\]')

        def identifier(label):
            return int(index_re.search(label).groups(0)[0])

        reading_order = sorted(doc.graph.nodes(), key=identifier)

        self.assertListEqual(reading_order, [
            "ROOT [0]", "Schedule 1 [1]", "PART 1 [2]", "1. [3]", "1.1. [4]",
            "1.2. [5]", "1.2.1 [6]", "1.2.2 [7]", "1.3. [8]", "2. [9]",
            "2.1. [10]", "2.1.1 [11]", "2.1.2 [12]"
        ])
Example #6
0
    def test_custom_ids(self):
        """
        Some sources are already structured and contain node IDs
        that map back to valid URIs

        In these cases we should be able to use these instead of
        creating new ones internally
        """
        # Marker lines may carry an inline dict literal right after the
        # component tag; its 'id' key overrides the internally generated id.
        it = [
            "[[Chapter]]{'id': '/base/chapter/1'} Chapter I",
            "This is chapter I text",
            "[[Article]]{'id': '/base/article/1'} Article I",
            "This is article I text",
        ]

        descriptor = {
            'components': ['Chapter', 'Section', 'Sub-section', 'Article'],
            'patterns': ['Chapter', 'Section', 'Sub-section', 'Article']
        }

        doc = parse_iterable(it, descriptor)

        result = [n for n in doc.graph.nodes(data=True)]

        # ROOT keeps the internal '/root' id; Chapter and Article carry the
        # custom URIs supplied inline. NOTE(review): Article's level is 4 —
        # apparently its 1-based position in 'components', not its depth in
        # this particular document; confirm against the parser.
        expected = [('ROOT [0]', {
            'meta': 'root',
            'level': 0,
            'text': [],
            'pad': False,
            'id': '/root'
        }),
                    ('Chapter [1]', {
                        'meta': 'Chapter',
                        'level': 1,
                        'pad': False,
                        'text': ["Chapter I", 'This is chapter I text'],
                        'id': '/base/chapter/1'
                    }),
                    ('Article [2]', {
                        'meta': 'Article',
                        'level': 4,
                        'pad': False,
                        'text': ["Article I", 'This is article I text'],
                        'id': '/base/article/1'
                    })]

        self.assertListEqual(result, expected)
Example #7
0
def parse_article(driver: WebDriver, url: str):
    """
    Given an article, parse its content into a
    representation that preserves the structure

    * Grab the HTML page
    * Crawl its contents
    * Tag the text with the different hierarchical components
    * Parse the resulting output into a graph
    * Enrich with metadata:
        - author information: name, url, etc;
        - document title;
        - publishing timestamp;
        - other metadata;

    Returns the dict serialization of the parsed document graph with
    the crawled metadata attached.
    """
    logger.info(f"Parsing article '{url}'")

    html = api.get(driver, url, make_headers(source='seekingalpha'), wait_for=2)
    # Pin the parser explicitly: BeautifulSoup(html) guesses the "best"
    # installed parser, so the markup tree (and downstream crawling) could
    # differ between environments.
    soup = BeautifulSoup(html, 'html.parser')
    logger.debug(f"Soup length '{len(soup)}'")

    hierarchy = ['Article', 'Section', 'Paragraph']

    descriptor = {
        'components': hierarchy,
        'patterns': hierarchy
    }

    text = crawl_article(soup)
    logger.info(f"Text crawled. Number of lines '{len(text)}'")

    logger.info("Creating a graph")
    doc = parse_iterable(text, descriptor)
    doc = doc.to_dict()

    doc['url'] = url
    doc['title'] = crawl_title(soup)
    doc['author'] = crawl_author(driver, soup)
    doc['timestamp'] = crawl_timestamp(soup)
    doc['symbols'] = crawl_ticker_symbols(soup)

    # TODO: meta, e.g likes and comments are still with problems
    doc['meta'] = crawl_metadata(soup)

    return doc
Example #8
0
    def test_inject_arbitrary_metadata(self):
        """Any extra keys in a marker's inline dict (not just 'id') should
        be injected verbatim into the corresponding node's data."""
        # 'p_number' is an arbitrary extra key carried alongside 'id'.
        it = [
            "[[Chapter]]{'id': '/base/chapter/1', 'p_number': 1} Chapter I",
            "This is chapter I text",
            "[[Article]]{'id': '/base/article/1'} Article I",
            "This is article I text",
        ]

        descriptor = {
            'components': ['Chapter', 'Section', 'Sub-section', 'Article'],
            'patterns': ['Chapter', 'Section', 'Sub-section', 'Article']
        }

        doc = parse_iterable(it, descriptor)

        result = [n for n in doc.graph.nodes(data=True)]

        # Chapter carries the injected 'p_number'; Article, which supplied
        # only 'id', gets no extra key.
        expected = [('ROOT [0]', {
            'meta': 'root',
            'level': 0,
            'text': [],
            'pad': False,
            'id': '/root'
        }),
                    ('Chapter [1]', {
                        'meta': 'Chapter',
                        'level': 1,
                        'pad': False,
                        'text': ["Chapter I", 'This is chapter I text'],
                        'id': '/base/chapter/1',
                        'p_number': 1
                    }),
                    ('Article [2]', {
                        'meta': 'Article',
                        'level': 4,
                        'pad': False,
                        'text': ["Article I", 'This is article I text'],
                        'id': '/base/article/1'
                    })]

        self.assertListEqual(result, expected)
Example #9
0
    def test_simple_parsing(self):
        """
        A minimal marked-up document parses into ROOT plus one node per
        tagged component, with hierarchical ids derived from ROOT.
        """
        lines = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]

        descriptor = {
            'components': ['Chapter', 'Section', 'Sub-section', 'Article'],
            'patterns': ['Chapter', 'Section', 'Sub-section', 'Article']
        }

        doc = parse_iterable(lines, descriptor)

        nodes = list(doc.graph.nodes(data=True))

        expected = [
            ('ROOT [0]', {
                'meta': 'root',
                'level': 0,
                'text': [],
                'pad': False,
                'id': '/root'
            }),
            ('Chapter [1]', {
                'meta': 'Chapter',
                'level': 1,
                'pad': False,
                'text': ["Chapter I", 'This is chapter I text'],
                'id': '/root/chapter-1'
            }),
            ('Article [2]', {
                'meta': 'Article',
                'level': 4,
                'pad': False,
                'text': ["Article I", 'This is article I text'],
                'id': '/root/chapter-1/article-2'
            }),
        ]

        self.assertListEqual(nodes, expected)
Example #10
0
    def test_copy(self):
        """
        `copy` must deep-copy the document: mutating the nodes of the
        copy must not affect the original, and vice versa.
        """
        it = [
            "[[C]] Chapter I",
            "This is chapter I text",
            "[[A]] Article I",
            "This is article I text",
            "[[A]] Article II",
            "This is article II text"
        ]

        descriptor = {
            'components': ['Chapter', 'Article'],
            'patterns': ['C', 'A']
        }

        doc = parse_iterable(it, descriptor)

        for _, node in doc.traverse():
            node['level'] = 0

        new_doc = copy(doc)

        for _, node in new_doc.traverse():
            node['level'] = 1

        # Fix: the original reused the single name `n` for the traversal
        # node, the node count AND the comprehension variable — same
        # behavior, but a shadowing trap waiting to bite.
        node_count = len(list(doc.traverse()))

        self.assertListEqual(
            [node['level'] for _, node in doc.traverse()],
            [0] * node_count
        )

        self.assertListEqual(
            [node['level'] for _, node in new_doc.traverse()],
            [1] * node_count
        )
Example #11
0
    def test_map_data_example_map_text(self):
        """
        Return a new document with the text field processed: the marker
        occurrences are stripped into a new 'content' field on each node.
        """
        it = [
            "Schedule 1 - First Part",
            " -- 100",
            "PPL – Final",
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
            "[[Article]] Article II",
            "This is article II text"
        ]

        descriptor = {
            'components': ['Schedule', 'Chapter', 'Article'],
            'patterns': [r'^Schedule\s\d{1,2}', 'Chapter', 'Article']
        }

        doc = parse_iterable(it, descriptor)

        # Fix: raw strings for the patterns — '\s' inside a plain string is
        # an invalid escape sequence (DeprecationWarning; SyntaxWarning on
        # Python 3.12+). The compiled patterns are byte-identical.
        # NOTE(review): the '^' inside the first pattern can never match
        # after '[[', so that exclude never fires — confirm intent.
        descriptor['exclude'] = [
            re.compile(r'\[\[^Schedule\s\d{1,2}\]\]'),
            re.compile(r'\[\[Chapter\]\]\s'),
            re.compile(r'\[\[Article\]\]\s')
        ]

        def remove_occurrences(data):
            # Apply every exclude pattern in turn to each text line.
            data['content'] = [
                reduce(lambda acc, x: x.sub('', acc), descriptor['exclude'], line)
                for line in data['text']
            ]
            return data

        new_doc = map_values(doc, remove_occurrences)

        result = [n['content'] for _, n in new_doc.traverse()]

        expected = [
            [],
            [
                "Schedule 1 - First Part",
                " -- 100",
                "PPL – Final",
            ],
            [
                "Chapter I",
                "This is chapter I text",
            ],
            [
                "Article I",
                "This is article I text",
            ],
            [
                "Article II",
                "This is article II text"
            ]

        ]

        self.assertListEqual(result, expected)
Example #12
0
    def test_document_with_gaps(self):
        """A document that revisits higher levels (Article back up to
        Chapter, then up to Schedule) should re-attach each new node
        under the most recent ancestor of a higher level."""
        it = [
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
            "[[Article]] Article II",
            "This is article II text",
            "[[Chapter]] Chapter II",
            "This is chapter II text",
            "[[Article]] Article I",
            "This is article I text",
            "[[Schedule]] Schedule I",
            "This is schedule I text",
            "[[Chapter]] Chapter I",
            "This is chapter I text",
            "[[Article]] Article I",
            "This is article I text",
        ]

        descriptor = {
            'components':
            ['Schedule', 'Chapter', 'Section', 'Sub-section', 'Article'],
            'patterns':
            ['Schedule', 'Chapter', 'Section', 'Sub-section', 'Article']
        }

        doc = parse_iterable(it, descriptor)

        result = [n for n in doc.graph.nodes(data=True)]

        # Ids encode ancestry: chapters 1 and 4 hang off ROOT, while the
        # final Chapter/Article pair nests under Schedule [6], which was
        # opened later at a higher level.
        expected = [('ROOT [0]', {
            'meta': 'root',
            'level': 0,
            'text': [],
            'pad': False,
            'id': '/root'
        }),
                    ('Chapter [1]', {
                        'meta': 'Chapter',
                        'level': 2,
                        'pad': False,
                        'text': ["Chapter I", 'This is chapter I text'],
                        'id': '/root/chapter-1'
                    }),
                    ('Article [2]', {
                        'meta': 'Article',
                        'level': 5,
                        'pad': False,
                        'text': ["Article I", 'This is article I text'],
                        'id': '/root/chapter-1/article-2'
                    }),
                    ('Article [3]', {
                        'meta': 'Article',
                        'level': 5,
                        'pad': False,
                        'text': ["Article II", 'This is article II text'],
                        'id': '/root/chapter-1/article-3'
                    }),
                    ('Chapter [4]', {
                        'meta': 'Chapter',
                        'level': 2,
                        'pad': False,
                        'text': ["Chapter II", 'This is chapter II text'],
                        'id': '/root/chapter-4'
                    }),
                    ('Article [5]', {
                        'meta': 'Article',
                        'level': 5,
                        'pad': False,
                        'text': ["Article I", 'This is article I text'],
                        'id': '/root/chapter-4/article-5'
                    }),
                    ('Schedule [6]', {
                        'meta': 'Schedule',
                        'level': 1,
                        'pad': False,
                        'text': ["Schedule I", 'This is schedule I text'],
                        'id': '/root/schedule-6'
                    }),
                    ('Chapter [7]', {
                        'meta': 'Chapter',
                        'level': 2,
                        'pad': False,
                        'text': ["Chapter I", 'This is chapter I text'],
                        'id': '/root/schedule-6/chapter-7'
                    }),
                    ('Article [8]', {
                        'meta': 'Article',
                        'level': 5,
                        'pad': False,
                        'text': ["Article I", 'This is article I text'],
                        'id': '/root/schedule-6/chapter-7/article-8'
                    })]

        self.assertListEqual(result, expected)