Example #1
0
    def get_stable_id(self) -> str:
        """Return the stable id for this context.

        :rtype: string
        """
        identity = self._get_polymorphic_identity()
        # Char offsets are not meaningful for this context type; use (0, 0).
        return construct_stable_id(self.section, identity, 0, 0)
Example #2
0
    def parse(self, document, contents):
        """Parse the document.

        :param document: The Document context of the data model.
        :param contents: The text contents of the document.
        :rtype: a *generator* of tokenized text.
        """
        position = 0
        for segment in contents.split(self.delim):
            # Skip segments that contain no visible characters.
            if not segment.strip():
                continue
            words = segment.split()
            # Each word starts one character (the joining space) after the
            # previous word ends.
            offsets = [0]
            running = 0
            for w in words[:-1]:
                running += len(w) + 1
                offsets.append(running)
            normalized = " ".join(words)
            stable_id = construct_stable_id(document, "sentence", position, position)
            yield {
                "text": normalized,
                "words": words,
                "pos_tags": [""] * len(words),
                "ner_tags": [""] * len(words),
                "lemmas": [""] * len(words),
                "dep_parents": [0] * len(words),
                "dep_labels": [""] * len(words),
                "char_offsets": offsets,
                "abs_char_offsets": offsets,
                "stable_id": stable_id,
            }
            position += 1
Example #3
0
 def get_stable_id(self) -> str:
     """Return a stable id built from this span's sentence and char range."""
     span_type = self._get_polymorphic_identity()
     return construct_stable_id(self.sentence, span_type,
                                self.char_start, self.char_end)
    def _parse_sentence(self, paragraph, node, state):
        """Parse the Sentences of the node.

        :param paragraph: The parent Paragraph context of the new Sentences.
        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        # Set name for Sentence
        name = node.attrib["name"] if "name" in node.attrib else None

        # Lingual Parse
        document = state["document"]
        for sentence in node:
            parts = defaultdict(list)
            parts["document"] = document
            has_bbox = False  # True once any character carries a bbox attr
            text = ""
            words = []
            char_abs_offsets = []
            start = 0
            for word in sentence:
                w = ""
                for char in word:
                    if "bbox" in char.attrib.keys():
                        has_bbox = True
                    w += char.text
                words.append(w)
                char_abs_offsets.append(start)
                # +1 accounts for the single space appended between words.
                start += 1 + len(word)
                # Collapse internal whitespace; raw string avoids the
                # invalid-escape warning of "\s+" on modern Python.
                text += re.sub(r"\s+", " ", w)
                text += " "
            # Skip sentences with no visual coordinates, empty or
            # whitespace-only text, or no word starting alphanumerically.
            # (An empty `text` implies empty `words`, so the `any` guard
            # already covers the original redundant `if not text` check.)
            if not has_bbox:
                continue
            if not text or text.isspace():
                continue
            if not any(p and p[0].isalnum() for p in words):
                continue

            for i, _ in enumerate(sentence):
                parts["words"].append(words[i].replace(" ", "_"))
                parts["lemmas"].append(words[i].replace(" ", "_"))
                parts["ner_tags"].append("")  # placeholder for later NLP parsing
                parts["char_offsets"].append(char_abs_offsets[i])
                parts["dep_parents"].append(0)  # placeholder for later NLP parsing
                parts["dep_labels"].append("")  # placeholder for later NLP parsing

            # Strip the trailing space appended after the last word.
            parts["text"], parts["pos_tags"] = self.lingual_parser.tagger(
                text[:-1])

            # Shift sentence-local char offsets to document-absolute offsets.
            abs_offset = state["sentence"]["abs_offset"]
            parts["abs_char_offsets"] = [
                char_offset + abs_offset
                for char_offset in parts["char_offsets"]
            ]
            parts["position"] = state["sentence"]["idx"]

            if self.tabular:
                # If tabular, consider own Context first in case a Cell
                # was just created. Otherwise, defer to the parent.
                parent = paragraph
                if isinstance(parent, Paragraph):
                    parts["section"] = parent.section
                    parts["paragraph"] = parent
                else:
                    raise NotImplementedError(
                        "Sentence parent must be Paragraph.")

            if self.structural:
                context_node = sentence
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = []
                temp_attrs = []
                for word in sentence:
                    if len(word) == 0:
                        continue
                    # Serialize all non-bbox attributes of the word's first
                    # character as space-separated "k=v" pairs.
                    t = ""
                    for k, v in word[0].attrib.items():
                        if k != "bbox":
                            v = v.replace(" ", "")
                            t = t + k + "=" + v + " "
                    t = t[:-1]
                    temp_attrs.append(t)
                parts["html_attrs"].extend(temp_attrs)

            if self.visual:
                page = []
                top = []
                left = []
                right = []
                bottom = []

                p = int(node.getparent().get("id"))
                bbox = node.getparent().get("bbox")
                bbox = bbox.split(",")
                height = int(round(float(bbox[3])))

                # Best-effort handling of malformed coordinates: on any
                # error, report and drop this sentence.
                try:
                    for word in sentence:
                        if len(word) == 0:
                            continue
                        # bbox of the first / last character of the word
                        coord_f = word[0].attrib["bbox"]
                        coord_l = word[-1].attrib["bbox"]
                        coord_f = coord_f.split(",")
                        coord_l = coord_l.split(",")
                        page.append(p)
                        left.append(int(round(float(coord_f[0]))))
                        bottom.append(height - int(round(float(coord_f[1]))))
                        right.append(int(round(float(coord_l[2]))))
                        # Flip the y-axis (PDF origin is bottom-left) and
                        # clamp to 0 for out-of-page coordinates.
                        if height > int(round(float(coord_f[3]))):
                            top.append(height - int(round(float(coord_f[3]))))
                        else:
                            top.append(0)
                    parts["page"] = page
                    parts["left"] = left
                    parts["top"] = top
                    parts["right"] = right
                    parts["bottom"] = bottom
                except Exception as e:
                    print(e)
                    print(document, "\n", text)
                    continue

            # BUG FIX: the stable_id construction and sentence-state updates
            # below were previously nested inside `if self.visual:`, so
            # non-visual parses yielded Sentences without a stable_id and
            # never advanced the sentence index/offset. They apply to every
            # sentence (cf. the sibling HTML implementation), so they now
            # run unconditionally.
            abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                       parts["char_offsets"][-1] +
                                       len(parts["words"][-1]))
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            state["sentence"]["idx"] += 1
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            parts["name"] = name

            yield Sentence(**parts)
Example #5
0
 def get_stable_id(self) -> str:
     """Return a stable id anchored on this context's paragraph."""
     identity = self._get_polymorphic_identity()
     # Char offsets are not meaningful here; use (0, 0).
     return construct_stable_id(self.paragraph, identity, 0, 0)
Example #6
0
    def _parse_sentence(self, paragraph: Paragraph, node: HtmlElement,
                        state: Dict[str, Any]) -> Iterator[Sentence]:
        """Parse the Sentences of the node.

        :param paragraph: The parent Paragraph context for the new Sentences.
        :param node: The lxml node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        """
        text = state["paragraph"]["text"]
        field = state["paragraph"]["field"]

        # Set name for Sentence
        name = node.attrib["name"] if "name" in node.attrib else None

        # Lingual Parse
        document = state["document"]
        for parts in self.lingual_parser.split_sentences(text):
            # Shift sentence-local char offsets to document-absolute offsets.
            abs_offset = state["sentence"]["abs_offset"]
            parts["abs_char_offsets"] = [
                char_offset + abs_offset
                for char_offset in parts["char_offsets"]
            ]
            parts["document"] = document
            # NOTE: Why do we overwrite this from the spacy parse?
            parts["position"] = state["sentence"]["idx"]
            # The stable id spans from the sentence's absolute start to the
            # end of its last word.
            abs_sentence_offset_end = (state["sentence"]["abs_offset"] +
                                       parts["char_offsets"][-1] +
                                       len(parts["words"][-1]))
            parts["stable_id"] = construct_stable_id(
                document,
                "sentence",
                state["sentence"]["abs_offset"],
                abs_sentence_offset_end,
            )
            parts["name"] = name
            state["sentence"]["abs_offset"] = abs_sentence_offset_end
            if self.structural:
                # Text from an element's "tail" belongs to the parent element.
                context_node = node.getparent() if field == "tail" else node
                tree = lxml.etree.ElementTree(state["root"])
                parts["xpath"] = tree.getpath(context_node)
                parts["html_tag"] = context_node.tag
                parts["html_attrs"] = [
                    "=".join(x) for x in list(context_node.attrib.items())
                ]

                # Extending html style attribute with the styles
                # from inline style class for the element.
                cur_style_index = None
                for index, attr in enumerate(parts["html_attrs"]):
                    if attr.find("style") >= 0:
                        cur_style_index = index
                        break
                head = state["root"].find("head")
                styles = None
                if head is not None:
                    styles = head.find("style")
                if styles is not None:
                    # NOTE(review): only the first "class" attribute is
                    # considered (the loop breaks after it), and the whole
                    # attribute value is matched as a single class name.
                    for x in list(context_node.attrib.items()):
                        if x[0] == "class":
                            # Match a CSS rule for this class in the <style>
                            # text. NOTE(review): the leading "." is an
                            # unescaped regex wildcard — presumably intended
                            # as a literal "." class selector; confirm before
                            # changing.
                            exp = r"(." + x[1] + r")([\n\s\r]*)\{(.*?)\}"
                            r = re.compile(exp, re.DOTALL)
                            if r.search(styles.text) is not None:
                                if cur_style_index is not None:
                                    # Append the rule body to the existing
                                    # style attr, stripped of line breaks and
                                    # tabs.
                                    parts["html_attrs"][cur_style_index] += (
                                        r.search(styles.text).group(3).replace(
                                            "\r",
                                            "").replace("\n",
                                                        "").replace("\t", ""))
                                else:
                                    # No style attribute yet: add one holding
                                    # the whitespace-normalized rule body.
                                    parts["html_attrs"].extend([
                                        "style=" + re.sub(
                                            r"\s{1,}",
                                            " ",
                                            r.search(
                                                styles.text).group(3).replace(
                                                    "\r", "").replace(
                                                        "\n", "").replace(
                                                            "\t", "").strip(),
                                        )
                                    ])
                            break
            # NOTE(review): duplicate of the assignment above; harmless but
            # redundant.
            parts["position"] = state["sentence"]["idx"]

            # If tabular, consider own Context first in case a Cell
            # was just created. Otherwise, defer to the parent.
            parent = paragraph
            if isinstance(parent, Paragraph):
                parts["section"] = parent.section
                parts["paragraph"] = parent
                if parent.cell:  # if True self.tabular is also always True
                    parts["table"] = parent.cell.table
                    parts["cell"] = parent.cell
                    parts["row_start"] = parent.cell.row_start
                    parts["row_end"] = parent.cell.row_end
                    parts["col_start"] = parent.cell.col_start
                    parts["col_end"] = parent.cell.col_end
            else:
                raise NotImplementedError("Sentence parent must be Paragraph.")
            yield Sentence(**parts)
            state["sentence"]["idx"] += 1