Example #1
def doc_setup():
    """Set up a document."""
    parser_udf = get_parser_udf()

    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = """<html>
                    <body>
                        <h1>test1</h1>
                        <h2>test2</h2>
                        <div>
                            <h3>test3</h3>
                            <table>
                                <tr>
                                    <td>test4</td>
                                    <td>test5</td>
                                </tr>
                            </table>
                            <table>
                                <tr>
                                    <td>test6</td>
                                    <td>test7</td>
                                </tr>
                            </table>
                        </div>
                        <p>test8 test9</p>
                    </body>
                </html>"""
    doc = parser_udf.apply(doc)

    return doc
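A hedged usage sketch of how this fixture might be consumed in a test. The exact sentence segmentation depends on the parser configuration, so only weak structural checks are made; the assertions are illustrative assumptions, not guaranteed parser output.

def test_doc_setup_smoke():
    """Illustrative only: sanity-check the contexts produced by doc_setup()."""
    doc = doc_setup()
    assert doc.name == "test"
    # Every parsed Sentence should point back at its Document.
    assert all(sentence.document is doc for sentence in doc.sentences)
    # The table cell contents should survive parsing into some sentence text.
    assert any("test4" in sentence.text for sentence in doc.sentences)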
Example #2
def doc_setup():
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
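A hedged sketch of what this setup yields, assuming spaCy keeps the three-token input as a single sentence (an assumption about the tokenizer, not part of the example above).

doc = doc_setup()
assert len(doc.sentences) == 1                      # "This is apple" stays one sentence
assert doc.sentences[0].text == "This is apple"
assert doc.sentences[0].words == ["This", "is", "apple"]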
Example #3
    def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
        name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
        with codecs.open(fp, encoding=self.encoding) as f:
            reader = csv.reader(f)

            # Load CSV header
            header_names = None
            if self.header:
                header_names = next(reader)

            # Load document per row
            for i, row in enumerate(reader):
                sections = []
                for j, content in enumerate(row):
                    rule = (self.parser_rule[j] if self.parser_rule is not None
                            and j in self.parser_rule else column_constructor)
                    content_header = (header_names[j]
                                      if header_names is not None else None)
                    context = [
                        build_node(t, n, c) for t, n, c in rule(content)
                    ]
                    sections.append(
                        build_node("section", content_header,
                                   "".join(context)))

                text = build_node("doc", None, "".join(sections))
                doc_name = name + ":" + str(i)
                stable_id = self._get_stable_id(doc_name)

                yield Document(
                    name=doc_name,
                    stable_id=stable_id,
                    text=text,
                    meta={"file_name": file_name},
                )
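A hedged sketch of a custom parser_rule mapping for this preprocessor, assuming the surrounding class is a CSV preprocessor like Fonduer's CSVDocPreprocessor (an assumption). The (tag, name, content) tuple shape mirrors the loop over rule(content) above; the rule functions, CSV path, and constructor arguments are illustrative assumptions.

# Hypothetical column rules: column 0 becomes plain text, column 1 a figure node.
def text_rule(content):
    return [("text", None, content)]

def figure_rule(content):
    return [("figure", None, content)]

preprocessor = CSVDocPreprocessor(   # constructor arguments assumed from the attributes used above
    "data/docs.csv",
    header=True,
    parser_rule={0: text_rule, 1: figure_rule},
)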
Example #4
def doc_setup():
    """Set up document."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple. That is orange. Where is banaba? I like Apple."
    lingual_parser = SpacyParser("en")
    # Split sentences
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Enrich sentences
    for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences):
        pass

    # Pick one sentence and add visual information
    # so that all the words get aligned horizontally.
    sentence: Sentence = doc.sentences[0]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 2nd sentence is horizontally aligned with 1st.
    sentence: Sentence = doc.sentences[1]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [40, 50, 60, 70]
    sentence.right = [50, 60, 70, 80]

    # Assume the 3rd sentence is vertically aligned with 1st.
    sentence: Sentence = doc.sentences[2]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [10, 10, 10, 10]
    sentence.bottom = [20, 20, 20, 20]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 4th sentence is in 2nd page.
    sentence: Sentence = doc.sentences[3]
    sentence.page = [2, 2, 2, 2]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    return doc
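The coordinates above encode the alignment relationships directly, so a test built on this fixture could check them with plain list comparisons. A hedged, illustrative sketch:

doc = doc_setup()
s0, s1, s2, s3 = doc.sentences[:4]
# 1st and 2nd sentences share a page and a vertical band: horizontally aligned.
assert s0.page == s1.page and s0.top == s1.top and s0.bottom == s1.bottom
# 1st and 3rd sentences share a page and a horizontal band: vertically aligned.
assert s0.page == s2.page and s0.left == s2.left and s0.right == s2.right
# The 4th sentence is on a different page.
assert s3.page[0] != s0.page[0]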
Example #5
 def parse_file(self, fp, file_name):
     with codecs.open(fp, encoding=self.encoding) as f:
         name = os.path.basename(fp).rsplit(".", 1)[0]
         stable_id = self.get_stable_id(name)
         doc = Document(
             name=name, stable_id=stable_id, meta={"file_name": file_name}
         )
         yield doc, f.read()
Example #6
 def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
     with codecs.open(fp, encoding=self.encoding) as f:
         name = os.path.basename(fp).rsplit(".", 1)[0]
         stable_id = self._get_stable_id(name)
         text = build_node("doc", None, build_node("text", None, f.read().strip()))
         yield Document(
             name=name, stable_id=stable_id, text=text, meta={"file_name": file_name}
         )
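For context, a hedged sketch of the ingestion pipeline such a preprocessor normally plugs into. The class names and keyword arguments follow Fonduer's documented API; the session object and input path are assumptions for illustration.

from fonduer.parser import Parser
from fonduer.parser.preprocessors import TextDocPreprocessor

preprocessor = TextDocPreprocessor("data/text/", max_docs=10)  # assumed input path
parser = Parser(session, structural=True, lingual=True)        # `session` from fonduer.Meta
parser.apply(preprocessor)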
Example #7
 def parse_file(self, fp, file_name):
     with codecs.open(fp, encoding=self.encoding) as tsv:
         for line in tsv:
             (doc_name, doc_text) = line.split("\t")
             stable_id = self.get_stable_id(doc_name)
             doc = Document(
                 name=doc_name, stable_id=stable_id, meta={"file_name": file_name}
             )
             yield doc, doc_text
Example #8
 def parse_file(self, f, file_name):
     for i, doc in enumerate(et.parse(f).xpath(self.doc)):
         doc_id = str(doc.xpath(self.id)[0])
         text = "\n".join(
             [t for t in doc.xpath(self.text) if t is not None])
         meta = {"file_name": str(file_name)}
         if self.keep_xml_tree:
             meta["root"] = et.tostring(doc)
         stable_id = self.get_stable_id(doc_id)
         yield Document(name=doc_id, stable_id=stable_id, meta=meta), text
Example #9
 def parse_file(self, fp, file_name):
     with codecs.open(fp, encoding=self.encoding) as f:
         soup = BeautifulSoup(f, "lxml")
         for text in soup.find_all("html"):
             name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
             stable_id = self.get_stable_id(name)
             yield Document(
                 name=name,
                 stable_id=stable_id,
                 text=str(text),
                 meta={"file_name": file_name},
             ), str(text)
Example #10
 def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
     with codecs.open(fp, encoding=self.encoding) as tsv:
         if self.header:
             tsv.readline()
         for line in tsv:
             (doc_name, doc_text) = line.split("\t")
             stable_id = self._get_stable_id(doc_name)
             text = build_node("doc", None, build_node("text", None, doc_text))
             yield Document(
                 name=doc_name,
                 stable_id=stable_id,
                 text=text,
                 meta={"file_name": file_name},
             )
Example #11
 def parse_file(self, fp, file_name):
     with codecs.open(fp, encoding=self.encoding) as f:
         soup = BeautifulSoup(f, "lxml")
         all_html_elements = soup.find_all("html")
         if len(all_html_elements) != 1:
             raise NotImplementedError(
                 "Expecting one html element per html file")
         text = all_html_elements[0]
         name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
         stable_id = self.get_stable_id(name)
         yield Document(
             name=name,
             stable_id=stable_id,
             text=str(text),
             meta={"file_name": file_name},
         )
Example #12
 def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
     with codecs.open(fp, encoding=self.encoding) as f:
         soup = BeautifulSoup(f, "lxml")
         all_xml_elements = soup.find_all("pages")
         if len(all_xml_elements) != 1:
             raise NotImplementedError(
                 f"unsupported format file: {file_name}")
         text = all_xml_elements[0]
         name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
         stable_id = self._get_stable_id(name)
         yield Document(
             name=name,
             stable_id=stable_id,
             text=str(text),
             meta={"file_name": file_name},
         )
Example #13
def _preprocess_visual_features(doc: Document) -> None:
    if hasattr(doc, "_visual_features"):
        return
    # cache flag
    doc._visual_features = True

    sentence_by_page: DefaultDict[int, List[Sentence]] = defaultdict(list)
    for sentence in doc.sentences:
        sentence_by_page[sentence.page[0]].append(sentence)
        sentence._aligned_lemmas = set()

    for page, sentences in sentence_by_page.items():
        # process per page alignments
        yc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x0_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        xc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x1_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        for sentence in sentences:
            sentence.bbox = bbox_from_sentence(sentence)
            sentence.yc = (sentence.bbox.top + sentence.bbox.bottom) / 2
            sentence.x0 = sentence.bbox.left
            sentence.x1 = sentence.bbox.right
            sentence.xc = (sentence.x0 + sentence.x1) / 2
            # index current sentence by different alignment keys
            yc_aligned[sentence.yc].append(sentence)
            x0_aligned[sentence.x0].append(sentence)
            x1_aligned[sentence.x1].append(sentence)
            xc_aligned[sentence.xc].append(sentence)
        for sents in yc_aligned.values():
            sents.sort(key=lambda p: p.xc)
        for sents in x0_aligned.values():
            sents.sort(key=lambda p: p.yc)
        for sents in x1_aligned.values():
            sents.sort(key=lambda p: p.yc)
        for sents in xc_aligned.values():
            sents.sort(key=lambda p: p.yc)
        _assign_alignment_features(yc_aligned, "Y_")
        _assign_alignment_features(x0_aligned, "LEFT_")
        _assign_alignment_features(x1_aligned, "RIGHT_")
        _assign_alignment_features(xc_aligned, "CENTER_")
Example #14
    def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
        xml_content = subprocess.check_output(
            f"pdf2txt.py -t xml -M 3 -m 5 -A '{fp}' ", shell=True)
        soup = BeautifulSoup(xml_content, "lxml")
        all_xml_elements = soup.find_all("pages")
        if len(all_xml_elements) != 1:
            raise NotImplementedError(f"unsupported format file: {file_name}")
        text = all_xml_elements[0]
        tree = etree.fromstring(str(text))
        try:
            tree = analysis(tree)
        except Exception as e:
            print(e)

        name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
        stable_id = self._get_stable_id(name)
        print(name)
        yield Document(
            name=name,
            stable_id=stable_id,
            text=etree.tostring(tree),
            meta={"file_name": file_name},
        )
Example #15
def test_parser_skips_and_flattens():
    """Test if ``Parser`` skips/flattens elements."""
    parser_udf = get_parser_udf()

    # Test if a parser skips comments
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "<html><body>Hello!<!-- comment --></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser skips blacklisted elements
    doc = Document(id=2, name="test2", stable_id="2::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser flattens elements
    doc = Document(id=3, name="test3", stable_id="3::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello, world!"

    # Now with different blacklist and flatten
    parser_udf = get_parser_udf(blacklist=["meta"], flatten=["word"])

    # Test if a parser does not skip non-blacklisted element
    doc = Document(id=4, name="test4", stable_id="4::document:0:0")
    doc.text = "<html><body><script>alert('Hello');</script><p>Hello!</p></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "alert('Hello');"
    assert doc.sentences[1].text == "Hello!"

    # Test if a parser skips blacklisted elements
    doc = Document(id=5, name="test5", stable_id="5::document:0:0")
    doc.text = "<html><head><meta name='keywords'></head><body>Hello!</body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello!"

    # Test if a parser does not flatten elements
    doc = Document(id=6, name="test6", stable_id="6::document:0:0")
    doc.text = "<html><body><span>Hello, <br>world!</span></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello,"
    assert doc.sentences[1].text == "world!"

    # Test if a parser flattens elements
    doc = Document(id=7, name="test7", stable_id="7::document:0:0")
    doc.text = "<html><body><word>Hello, </word><word>world!</word></body></html>"
    doc = parser_udf.apply(doc)
    assert doc.sentences[0].text == "Hello, world!"
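The first half of the test exercises what appear to be the default blacklist/flatten settings, and the second half overrides them. A hedged sketch of the equivalent configuration on the public Parser class (the defaults and the session object are assumptions based on Fonduer's documented API):

from fonduer.parser import Parser

parser_default = Parser(session, blacklist=["style", "script"], flatten=["span", "br"])
parser_custom = Parser(session, blacklist=["meta"], flatten=["word"])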
Example #16
    def _parse_file(self, fp: str, file_name: str) -> Iterator[Document]:
        # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
        def get_prop(node: Tag, name: str) -> Optional[str]:
            title = node["title"]
            if not title:
                return None
            props = title.split(";")
            for prop in props:
                (key, args) = prop.split(None, 1)
                if key == name:
                    return args
            return None

        # Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
        def get_bbox(node: Tag) -> Optional[Tuple[str, ...]]:
            bbox = get_prop(node, "bbox")
            if not bbox:
                return None
            return tuple(bbox.split())

        with codecs.open(fp, encoding=self.encoding) as f:
            soup = BeautifulSoup(f, "lxml")
        all_html_elements = soup.find_all("html")
        if len(all_html_elements) != 1:
            raise NotImplementedError(
                f"Expecting exactly one html element per html file: {file_name}"
            )
        root = all_html_elements[0]
        capabilities = root.find("meta", attrs={"name": "ocr-capabilities"})
        if capabilities is None:
            raise RuntimeError(
                "The input hOCR does not contain ocr-capabilities metadata.")

        # Unwrap ocr_line/ocrx_line as Fonduer has no data model for lines.
        if "ocr_line" in capabilities["content"]:
            for line in root.find_all(class_="ocr_line"):
                line.unwrap()
        if "ocrx_line" in capabilities["content"]:
            for line in root.find_all(class_="ocrx_line"):
                line.unwrap()

        if "ocrx_word" in capabilities["content"]:
            for p, page in enumerate(root.find_all(class_="ocr_page")):
                ppageno = str(p)  # 0-based
                for word in page.find_all(class_="ocrx_word"):
                    parent = word.parent
                    (left, top, right, bottom) = get_bbox(word)

                    # ocrx_word could have multiple words with one or more of spaces
                    # in-between. This actually happens on Tesseract 4.00.
                    # This is normalized by splitting and concatenating later.
                    tokens = word.text.split()

                    if "left" not in parent.attrs:
                        parent["left"] = []
                        parent["top"] = []
                        parent["right"] = []
                        parent["bottom"] = []
                        parent["ppageno"] = []
                        parent["tokens"] = []
                    parent["left"] += [left] * len(tokens)
                    parent["top"] += [top] * len(tokens)
                    parent["right"] += [right] * len(tokens)
                    parent["bottom"] += [bottom] * len(tokens)
                    parent["ppageno"] += [ppageno] * len(tokens)
                    parent["tokens"] += tokens

                    if "ocrp_wconf" in capabilities["content"]:
                        x_wconf = get_prop(word, "x_wconf")
                        if "x_wconf" not in parent.attrs:
                            parent["x_wconf"] = []
                        parent["x_wconf"].append(x_wconf)
                    # Mark the parent element
                    if "fonduer" not in parent.attrs:
                        parent["fonduer"] = ["1"]

                    # Concat words again with " " or "".
                    if len(tokens) > 1:
                        if self.space:
                            word.string.replace_with(" ".join(tokens))
                        else:
                            word.string.replace_with("".join(tokens))
                    word.unwrap()

            # Clean-up
            for i, parent in enumerate(root.find_all(attrs={"fonduer": "1"})):
                # Concat consecutive NavigableString
                parent.smooth()  # beautifulsoup4 >= 4.8.0

                # Remove linebreaks and excess spaces
                # in reverse order b/c removing element from list in loop
                for child in reversed(parent.contents):
                    if isinstance(child, Comment):  # remove comments
                        child.extract()
                    elif isinstance(child, NavigableString):
                        if child.strip() == "":  # remove if space or linebreak
                            child.extract()
                        else:
                            tmp = re.sub(r"[\n\s]+", " " if self.space else "",
                                         child)
                            n = NavigableString(tmp.strip())
                            child.replace_with(n)
                del parent["fonduer"]

        name = os.path.basename(fp)[:os.path.basename(fp).rfind(".")]
        stable_id = self._get_stable_id(name)
        yield Document(
            name=name,
            stable_id=stable_id,
            text=str(root),
            meta={"file_name": file_name},
        )
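For orientation, a hedged sketch of the minimal kind of hOCR fragment this preprocessor expects. The class names and the title properties follow the hOCR spec; the bounding-box values and confidences are made up.

hocr_snippet = """
<html>
  <head><meta name='ocr-capabilities' content='ocr_page ocr_line ocrx_word ocrp_wconf'/></head>
  <body>
    <div class='ocr_page' title='bbox 0 0 800 600'>
      <span class='ocr_line' title='bbox 10 10 150 30'>
        <span class='ocrx_word' title='bbox 10 10 60 30; x_wconf 95'>Hello</span>
        <span class='ocrx_word' title='bbox 70 10 150 30; x_wconf 93'>world</span>
      </span>
    </div>
  </body>
</html>
"""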
Example #17
    def parse(self, document: Document, text: str) -> Iterator[Sentence]:
        """Depth-first search over the provided tree.

        Implemented as an iterative procedure. The structure of the state
        needed to parse each node is also defined in this function.

        :param document: the Document context
        :param text: the structured text of the document (e.g. HTML)
        :return: a *generator* of Sentences.
        """
        stack = []

        root = lxml.html.fromstring(text)

        # flattens children of node that are in the 'flatten' list
        if self.flatten:
            lxml.etree.strip_tags(root, self.flatten)
        # Strip comments
        lxml.etree.strip_tags(root, lxml.etree.Comment)
        # Assign the text, which was stripped of the 'flatten'-tags, to the document
        document.text = lxml.etree.tostring(root, encoding="unicode")

        # This dictionary contains the global state necessary to parse a
        # document and each context element. This reflects the relationships
        # defined in parser/models. This contains the state necessary to create
        # the respective Contexts within the document.
        state = {
            "visited": set(),
            "parent": {},  # map of parent[child] = node used to discover child
            "context": {},  # track the Context of each node (context['td'] = Cell)
            "root": root,
            "document": document,
            "section": {"idx": 0},
            "paragraph": {"idx": 0},
            "figure": {"idx": 0},
            "caption": {"idx": 0},
            "table": {"idx": 0},
            "sentence": {"idx": 0, "abs_offset": 0},
        }
        # NOTE: Currently the helper functions directly manipulate the state
        # rather than returning a modified copy.

        # Iterative Depth-First Search
        stack.append(root)
        state["parent"][root] = document
        state["context"][root] = document

        tokenized_sentences: List[Sentence] = []
        while stack:
            node = stack.pop()
            if node not in state["visited"]:
                state["visited"].add(node)  # mark as visited

                # Process
                if self.lingual:
                    tokenized_sentences += [
                        y for y in self._parse_node(node, state)
                    ]
                else:
                    yield from self._parse_node(node, state)

                # NOTE: This reversed() order is to ensure that the iterative
                # DFS matches the order that would be produced by a recursive
                # DFS implementation.
                for child in reversed(node):
                    # Skip nodes that are blacklisted
                    if self.blacklist and child.tag in self.blacklist:
                        continue

                    stack.append(child)

                    # store the parent of the node, which is either the parent
                    # Context, or if the parent did not create a Context, then
                    # use the node's parent Context.
                    state["parent"][child] = (state["context"][node]
                                              if node in state["context"] else
                                              state["parent"][node])

        if self.lingual:
            yield from self.lingual_parser.enrich_sentences_with_NLP(
                tokenized_sentences)
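A hedged usage sketch of driving parse() directly on one Document; parser_udf stands in for an instance of the surrounding class, and the expected sentence text is an illustrative assumption.

doc = Document(id=1, name="test", stable_id="1::document:0:0")
html = "<html><body><p>Hello world.</p></body></html>"
sentences = list(parser_udf.parse(doc, html))
assert sentences[0].text == "Hello world."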
Example #18
def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join([
        "Tim Cook was born in USA in 1960.",
        "He is the CEO of Apple.",
        "He sold 100 million of iPhone.",
    ])
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"
    ]

    # The length of words and that of ner_tags must match for each sentence.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"iPhone"}