Code Example #1: set up and parse a small HTML document with the parser UDF
# Assumed imports for this standalone snippet; get_parser_udf is a helper
# defined in the surrounding test module, not a library function.
from fonduer.parser.models import Document


def doc_setup():
    """Set up a document."""
    parser_udf = get_parser_udf()

    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = """<html>
                    <body>
                        <h1>test1</h1>
                        <h2>test2</h2>
                        <div>
                            <h3>test3</h3>
                            <table>
                                <tr>
                                    <td>test4</td>
                                    <td>test5</td>
                                </tr>
                            </table>
                            <table>
                                <tr>
                                    <td>test6</td>
                                    <td>test7</td>
                                </tr>
                            </table>
                        </div>
                        <p>test8 test9</p>
                    </body>
                </html>"""
    doc = parser_udf.apply(doc)

    return doc
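A minimal usage sketch of the helper above; doc.sentences and doc.tables are assumed relationships on the parsed Document in the Fonduer data model:

doc = doc_setup()
print(len(doc.sentences))  # sentences extracted from the HTML body
print(len(doc.tables))     # the two <table> elements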
Code Example #2: set up a one-sentence document with SpacyParser
# Assumed imports for this standalone snippet.
from fonduer.parser.lingual_parser import SpacyParser
from fonduer.parser.models import Document, Sentence


def doc_setup():
    """Set up a document with a single sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
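A quick inspection sketch for this helper; text and words below follow the Sentence attributes used elsewhere in these examples:

doc = doc_setup()
assert len(doc.sentences) == 1
print(doc.sentences[0].text)   # "This is apple"
print(doc.sentences[0].words)  # tokens produced by SpacyParser.split_sentences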
Code Example #3: test that the parser skips and flattens HTML elements
# Assumed imports: logging, plus Document and the get_parser_udf helper as in
# Code Example #1.
def test_parser_skips_and_flattens(caplog):
    """Test if ``Parser`` skips/flattens elements."""
    caplog.set_level(logging.INFO)

    parser_udf = get_parser_udf()

    # Test if a parser skips comments
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "<html><body>Hello!<!-- comment --></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser skips blacklisted elements (<script> is blacklisted by default)
    doc = Document(id=2, name="test2", stable_id="2::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser flattens elements (<span> and <br> are flattened by default)
    doc = Document(id=3, name="test3", stable_id="3::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello, world!"

    # Now with a different blacklist and flatten list
    parser_udf = get_parser_udf(blacklist=["meta"], flatten=["word"])

    # Test if a parser does not skip non-blacklisted elements (<script> is no longer blacklisted)
    doc = Document(id=4, name="test4", stable_id="4::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "alert('Hello');"
    assert doc.sentences[1].text == "Hello!"

    # Test if a parser skips blacklisted elements (<meta> is now blacklisted)
    doc = Document(id=5, name="test5", stable_id="5::document:0:0")
    doc.text = "<html><head><meta name='keywords'></head><body>Hello!</body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser no longer flattens <span>/<br> (removed from the flatten list)
    doc = Document(id=6, name="test6", stable_id="6::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello,"
    assert doc.sentences[1].text == "world!"

    # Test if a parser flattens elements (<word> is now in the flatten list)
    doc = Document(id=7, name="test7", stable_id="7::document:0:0")
    doc.text = "<html><body><word>Hello, </word><word>world!</word></body></html>"
    for _ in parser_udf.apply(doc):
        pass
    assert doc.sentences[0].text == "Hello, world!"
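The flatten behavior exercised above can be reproduced in isolation with the same lxml call the parser uses internally (see Code Example #5); a minimal standalone sketch:

import lxml.etree
import lxml.html

root = lxml.html.fromstring("<html><body><span>Hello, <br>world!</span></body></html>")
lxml.etree.strip_tags(root, "span", "br")
print(lxml.etree.tostring(root, encoding="unicode"))
# <html><body>Hello, world!</body></html>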
Code Example #4: set up a document with manually attached visual (bounding-box) information
# Assumed imports: Document, Sentence, and SpacyParser as in Code Example #2.
def doc_setup():
    """Set up document."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple. That is orange. Where is banaba? I like Apple."
    lingual_parser = SpacyParser("en")
    # Split sentences
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Enrich sentences
    for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences):
        pass

    # Pick one sentence and add visual information
    # so that all the words get aligned horizontally.
    sentence: Sentence = doc.sentences[0]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 2nd sentence is horizontally aligned with the 1st.
    sentence: Sentence = doc.sentences[1]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [40, 50, 60, 70]
    sentence.right = [50, 60, 70, 80]

    # Assume the 3rd sentence is vertically aligned with the 1st.
    sentence: Sentence = doc.sentences[2]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [10, 10, 10, 10]
    sentence.bottom = [20, 20, 20, 20]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 4th sentence is on the 2nd page.
    sentence: Sentence = doc.sentences[3]
    sentence.page = [2, 2, 2, 2]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    return doc
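A minimal check sketch over the coordinates assigned above: the 2nd sentence shares the 1st sentence's row, the 3rd shares its columns, and the 4th sits on a different page:

doc = doc_setup()
s0, s1, s2, s3 = doc.sentences[:4]
assert s0.top == s1.top and s0.bottom == s1.bottom  # horizontally aligned
assert s0.left == s2.left and s0.right == s2.right  # vertically aligned
assert set(s3.page) == {2}                          # on the second page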
Code Example #5: the parse method, an iterative depth-first traversal over the HTML tree that drives structural parsing
    def parse(self, document: Document, text: str) -> Iterator[Sentence]:
        """Depth-first search over the provided tree.

        Implemented as an iterative procedure. The structure of the state
        needed to parse each node is also defined in this function.

        :param document: the Document context
        :param text: the structured text of the document (e.g. HTML)
        :return: a *generator* of Sentences.
        """
        stack = []

        root = lxml.html.fromstring(text)

        # Flatten (strip) tags that are in the 'flatten' list, keeping their
        # text and children in place
        if self.flatten:
            lxml.etree.strip_tags(root, self.flatten)
        # Strip comments
        lxml.etree.strip_tags(root, lxml.etree.Comment)
        # Assign the text, now stripped of the 'flatten' tags, back to the document
        document.text = lxml.etree.tostring(root, encoding="unicode")

        # This dictionary contains the global state necessary to parse a
        # document and each context element. It reflects the relationships
        # defined in parser/models and holds the state needed to create the
        # respective Contexts within the document.
        state = {
            "visited": set(),
            "parent": {},  # map of parent[child] = node used to discover child
            "context": {},  # track the Context of each node (context['td'] = Cell)
            "root": root,
            "document": document,
            "section": {"idx": 0},
            "paragraph": {"idx": 0},
            "figure": {"idx": 0},
            "caption": {"idx": 0},
            "table": {"idx": 0},
            "sentence": {"idx": 0, "abs_offset": 0},
        }
        # NOTE: Currently the helper functions directly manipulate the state
        # rather than returning a modified copy.

        # Iterative Depth-First Search
        stack.append(root)
        state["parent"][root] = document
        state["context"][root] = document

        tokenized_sentences: List[Sentence] = []
        while stack:
            node = stack.pop()
            if node not in state["visited"]:
                state["visited"].add(node)  # mark as visited

                # Process
                if self.lingual:
                    tokenized_sentences.extend(self._parse_node(node, state))
                else:
                    yield from self._parse_node(node, state)

                # NOTE: This reversed() order is to ensure that the iterative
                # DFS matches the order that would be produced by a recursive
                # DFS implementation.
                for child in reversed(node):
                    # Skip nodes that are blacklisted
                    if self.blacklist and child.tag in self.blacklist:
                        continue

                    stack.append(child)

                    # store the parent of the node, which is either the parent
                    # Context, or if the parent did not create a Context, then
                    # use the node's parent Context.
                    state["parent"][child] = (state["context"][node]
                                              if node in state["context"] else
                                              state["parent"][node])

        if self.lingual:
            yield from self.lingual_parser.enrich_sentences_with_NLP(
                tokenized_sentences)
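The reversed() push noted in the comments can be illustrated on its own: pushing children in reverse order makes the stack-based traversal visit nodes in the same order as a recursive pre-order DFS. A small sketch, independent of Fonduer:

import lxml.etree

root = lxml.etree.fromstring("<root><a><b/></a><c/></root>")
stack, order = [root], []
while stack:
    node = stack.pop()
    order.append(node.tag)
    for child in reversed(node):
        stack.append(child)
print(order)  # ['root', 'a', 'b', 'c'] -- the recursive pre-order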
Code Example #6: test the NER-type matchers against manually attached NER tags
# Assumed imports for this standalone snippet.
from fonduer.candidates import MentionNgrams
from fonduer.candidates.matchers import (
    DateMatcher,
    LocationMatcher,
    MiscMatcher,
    NumberMatcher,
    OrganizationMatcher,
    PersonMatcher,
)
from fonduer.parser.lingual_parser import SpacyParser
from fonduer.parser.models import Document, Sentence


def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join([
        "Tim Cook was born in USA in 1960.",
        "He is the CEO of Apple.",
        "He sold 100 million of iPhone.",
    ])
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"
    ]

    # the length of words and that of ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"iPhone"}
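A hedged composition sketch: the matchers above can typically be combined; Union is assumed here to be importable from fonduer.candidates.matchers alongside them:

from fonduer.candidates.matchers import Union

person_or_org = Union(PersonMatcher(), OrganizationMatcher())
spans = set(tc.get_span() for tc in person_or_org.apply(space.apply(doc)))
# With the tags assigned above, this is expected to yield {"Tim Cook", "Apple"}.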