Beispiel #1
0
    def test_root_annotations(self):
        """
        Check that annotations of the title lines end up on the root node.

        The expected offsets of the second line's annotation are moved by the
        length of the first line (10 characters).
        """
        title_lines = [
            LineWithMeta(line="bold text\n",
                         hierarchy_level=HierarchyLevel.create_root(),
                         metadata=ParagraphMetadata(paragraph_type="root",
                                                    predicted_classes=None,
                                                    page_id=0,
                                                    line_id=0),
                         annotations=[BoldAnnotation(start=0, end=10, value="True")]),
            LineWithMeta(line="italic text\n",
                         hierarchy_level=HierarchyLevel.create_root(),
                         metadata=ParagraphMetadata(paragraph_type="root",
                                                    predicted_classes=None,
                                                    page_id=0,
                                                    line_id=1),
                         annotations=[ItalicAnnotation(start=0, end=12, value="True")]),
        ]
        root = TreeNode.create(lines=title_lines).get_root()
        found = root.annotations
        # order by start offset so that the bold annotation comes first
        found.sort(key=lambda annotation: annotation.start)
        self.assertEqual(2, len(found))
        bold, italic = found

        # annotation of the first line keeps its original offsets
        self.assertEqual(BoldAnnotation.name, bold.name)
        self.assertEqual(("True", 0, 10), (bold.value, bold.start, bold.end))

        # annotation of the second line is expected at 10..22 in the joined text
        self.assertEqual(ItalicAnnotation.name, italic.name)
        self.assertEqual(("True", 10, 22), (italic.value, italic.start, italic.end))
Beispiel #2
0
    def _get_lines_with_meta(
            self, paragraph_list: List[Paragraph]) -> List[LineWithMeta]:
        """
        Convert paragraphs into lines with metadata, hierarchy level and annotations.

        :param paragraph_list: list of Paragraph
        :return: list of LineWithMeta as returned by the hierarchy level extractor
        """
        # annotation name reported by ParagraphInfo -> annotation class to build
        annotation_classes = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation
        }
        result = []

        # line ids are 1-based: the first paragraph gets line_id=1
        for line_id, paragraph in enumerate(paragraph_list, start=1):
            # info has the following form:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            info = ParagraphInfo(paragraph).get_info()

            text = info["text"]
            uid = '{}_{}'.format(self.path_hash, info["uid"])
            paragraph_type = info["type"]

            level = info["level"]
            # paragraphs without a level are treated as raw text
            hierarchy_level = (HierarchyLevel(level[0], level[1], False, paragraph_type)
                               if level
                               else HierarchyLevel(None, None, False, "raw_text"))

            # first item is the annotation name, the rest are constructor arguments
            annotations = [annotation_classes[item[0]](*item[1:])
                           for item in info["annotations"]]

            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=line_id)
            result.append(LineWithMeta(line=text,
                                       hierarchy_level=hierarchy_level,
                                       metadata=metadata,
                                       annotations=annotations,
                                       uid=uid))
            # NOTE(review): the extractor is re-run on the whole accumulated list after
            # every paragraph - confirm whether one call after the loop would be enough
            result = self.hierarchy_level_extractor.get_hierarchy_level(result)
        return result
 def test_equal(self):
     """Two levels built from the same arguments compare as equal under every operator."""
     params = dict(level_1=3, level_2=3, can_be_multiline=True, paragraph_type="header")
     first = HierarchyLevel(**params)
     second = HierarchyLevel(**params)
     # strict comparisons must fail for equal levels
     self.assertFalse(first < second)
     self.assertTrue(first <= second)
     self.assertFalse(first > second)
     # non-strict comparisons and equality must hold
     self.assertTrue(first >= second)
     self.assertTrue(first == second)
 def test_one_greater_than_other_level2(self):
     """With equal level_1 the level with the smaller level_2 is expected to be the lesser one."""
     common = dict(level_1=2, can_be_multiline=False, paragraph_type="list_item")
     smaller = HierarchyLevel(level_2=1, **common)
     bigger = HierarchyLevel(level_2=2, **common)
     self.assertTrue(smaller < bigger)
     self.assertTrue(smaller <= bigger)
     self.assertFalse(smaller > bigger)
     self.assertFalse(smaller >= bigger)
     self.assertFalse(smaller == bigger)
 def test_raw_text_greater_than_any_other(self):
     """A raw text level is expected to be strictly greater than a list item level."""
     raw_text = HierarchyLevel.create_raw_text()
     list_item = HierarchyLevel(level_1=2,
                                level_2=1,
                                can_be_multiline=False,
                                paragraph_type="list_item")
     # the list item is neither greater than nor equal to raw text
     self.assertFalse(list_item > raw_text)
     self.assertFalse(list_item >= raw_text)
     self.assertFalse(list_item == raw_text)
     # ... it is strictly less
     self.assertTrue(list_item < raw_text)
     self.assertTrue(list_item <= raw_text)
 def test_two_raw_text(self):
     """Raw text levels are expected to be equal to each other, whatever numeric levels they carry."""
     reference = HierarchyLevel.create_raw_text()
     same_factory = HierarchyLevel.create_raw_text()
     explicit_levels = HierarchyLevel(level_1=1,
                                      level_2=2,
                                      can_be_multiline=False,
                                      paragraph_type="raw_text")
     # first another factory-made raw text, then a raw text with explicit numeric levels
     for other in (same_factory, explicit_levels):
         self.assertTrue(reference == other)
         self.assertTrue(reference >= other)
         self.assertTrue(reference <= other)
Beispiel #7
0
    def create(lines: List[LineWithMeta] = None) -> "TreeNode":
        """
        Creates a root node from the given title lines.

        The text of the root is the concatenation of the texts of all lines. The annotations
        of every line are passed through ``TreeNode.__shift_annotations`` together with the
        length of the text that precedes the line in the concatenation.

        :param lines: these lines should be the title of the document
            (empty list or None for documents without title)
        :return: root of the document tree
        """
        if lines is None:
            # the declared default is None: treat it as "no title" instead of failing on len(None)
            lines = []

        # the root takes the smallest page and line ids of its lines, 0 when there are no lines
        page_id = min((line.metadata.page_id for line in lines), default=0)
        line_id = min((line.metadata.line_id for line in lines), default=0)

        annotations = []
        text_length = 0
        for line in lines:
            annotations.extend(
                TreeNode.__shift_annotations(line=line,
                                             text_length=text_length))
            text_length += len(line.line)
        text = "".join(line.line for line in lines)
        metadata = ParagraphMetadata(paragraph_type="root",
                                     page_id=page_id,
                                     line_id=line_id,
                                     predicted_classes=None)
        return TreeNode("0",
                        text,
                        annotations=annotations,
                        metadata=metadata,
                        subparagraphs=[],
                        hierarchy_level=HierarchyLevel.create_root(),
                        parent=None)
Beispiel #8
0
 def insert_table(self, document: UnstructuredDocument) -> UnstructuredDocument:
     """
     Take a document and insert the table cells into the list of its paragraphs.

     Tables whose metadata is marked as already inserted are skipped. A table is placed right
     after the line that carries a TableAnnotation with the uid of the table; tables that are
     not referenced by any line are appended after the last line.
     Raw text lines of the input document get a new hierarchy level (set in place).
     """
     pending_tables = {}
     for table in document.tables:
         if not table.metadata.is_inserted:
             pending_tables[table.metadata.uid] = table

     # the biggest level_1 among the lines that are not raw text (0 if there are none)
     structured_levels = [line.hierarchy_level.level_1
                          for line in document.lines
                          if not line.hierarchy_level.is_raw_text()]
     hierarchy_level = max(structured_levels, default=0)
     raw_text_level = HierarchyLevel(level_1=hierarchy_level + 1,
                                     level_2=0,
                                     can_be_multiline=True,
                                     paragraph_type=HierarchyLevel.raw_text)

     paragraphs = []
     for line in document.lines:
         if line.hierarchy_level.is_raw_text():
             line.set_hierarchy_level(raw_text_level)
         paragraphs.append(line)
         for annotation in line.annotations:
             if annotation.name != TableAnnotation.name:
                 continue
             table_id = annotation.value
             if table_id not in pending_tables:
                 continue
             # remove the table from the pending ones, so it is inserted only once
             table = pending_tables.pop(table_id)
             paragraphs.extend(
                 self._create_paragraphs_from_table(table=table, hierarchy_level=hierarchy_level))

     # tables without a referencing line go to the end of the document
     for table in pending_tables.values():
         paragraphs.extend(
             self._create_paragraphs_from_table(table=table, hierarchy_level=hierarchy_level))
     return UnstructuredDocument(lines=paragraphs, tables=document.tables, attachments=document.attachments)
Beispiel #9
0
 def __get_line(self,
                text: str,
                level1: int,
                level2: int,
                hl: str = "list") -> LineWithMeta:
     """
     Build a line without annotations, with "list_item" metadata and the given hierarchy level.

     :param text: text of the line
     :param level1: first positional argument of HierarchyLevel
     :param level2: second positional argument of HierarchyLevel
     :param hl: fourth positional argument of HierarchyLevel (presumably the paragraph type)
     :return: the constructed LineWithMeta
     """
     return LineWithMeta(text,
                         hierarchy_level=HierarchyLevel(level1, level2, False, hl),
                         metadata=ParagraphMetadata("list_item", None, 0, None),
                         annotations=[])
Beispiel #10
0
 def __create_list_line(line: LineWithMeta):
     """
     Build an empty line of type "list" from the given line.

     The new line copies level_1, page_id and line_id of the given line; its level_2 is the
     level_2 of the given line minus 0.5.

     :param line: the line that supplies hierarchy level and metadata
     :return: LineWithMeta with empty text and no annotations
     """
     source_level = line.hierarchy_level
     list_level = HierarchyLevel(
         level_1=source_level.level_1,
         level_2=source_level.level_2 - 0.5,  # noqa  the half step is intentional for lists
         paragraph_type="list",
         can_be_multiline=False
     )
     source_metadata = line.metadata
     list_metadata = ParagraphMetadata(paragraph_type="list",
                                       page_id=source_metadata.page_id,
                                       line_id=source_metadata.line_id,
                                       predicted_classes=None)
     return LineWithMeta(line="",
                         hierarchy_level=list_level,
                         metadata=list_metadata,
                         annotations=[])
Beispiel #11
0
 def _create_cell_line(table: Table, hierarchy_level: int, cell: str) -> LineWithMeta:
     """
     Build the line that represents one cell of the table.

     :param table: table that contains the cell, its page_id is copied into the metadata
     :param hierarchy_level: base level, the cell gets level_1 = hierarchy_level + 3
     :param cell: text of the cell
     :return: LineWithMeta of type "table_cell" without annotations and without line_id
     """
     cell_type = "table_cell"
     return LineWithMeta(line=cell,
                         hierarchy_level=HierarchyLevel(level_1=hierarchy_level + 3,
                                                        level_2=0,
                                                        can_be_multiline=False,
                                                        paragraph_type=cell_type),
                         metadata=ParagraphMetadata(paragraph_type=cell_type,
                                                    predicted_classes=None,
                                                    page_id=table.metadata.page_id,
                                                    line_id=None),
                         annotations=[])
Beispiel #12
0
 def _create_table_line(table: Table, hierarchy_level: int) -> LineWithMeta:
     """
     Build the empty line that represents the table itself.

     :param table: the table, its page_id goes to the metadata and its uid to the uid of the line
     :param hierarchy_level: base level, the table gets level_1 = hierarchy_level + 1
     :return: LineWithMeta of type "table" with empty text and uid "table_<table uid>"
     """
     table_type = "table"
     return LineWithMeta(line="",
                         hierarchy_level=HierarchyLevel(level_1=hierarchy_level + 1,
                                                        level_2=0,
                                                        can_be_multiline=False,
                                                        paragraph_type=table_type),
                         metadata=ParagraphMetadata(paragraph_type=table_type,
                                                    predicted_classes=None,
                                                    page_id=table.metadata.page_id,
                                                    line_id=None),
                         annotations=[],
                         uid="table_{}".format(table.metadata.uid))
Beispiel #13
0
 def create(texts: Iterable[str]) -> "TreeNode":
     """
     Creates a root node whose text is the given texts joined with a newline.

     :param texts: this text should be the title of the document (or should be empty for documents without title)
     :return: root of the document tree (node id "0", no annotations, no subparagraphs, no parent)
     """
     root_type = "root"
     return TreeNode("0",
                     "\n".join(texts),
                     annotations=[],
                     metadata=ParagraphMetadata(paragraph_type=root_type,
                                                page_id=0,
                                                line_id=0,
                                                predicted_classes=None),
                     subparagraphs=[],
                     hierarchy_level=HierarchyLevel(0, 0, True, paragraph_type=root_type),
                     parent=None)
Beispiel #14
0
 def __handle_one_element(self, depth: int, value, paragraph_type: str,
                          paragraph_type_meta):
     """
     Build a line for one element.

     :param depth: depth of the element, used as level_1 unless the element is a title at depth 1
     :param value: the element, its text is obtained with self.__get_text
     :param paragraph_type: paragraph type stored in the metadata of the line
     :param paragraph_type_meta: paragraph type stored in the hierarchy level of the line
     :return: LineWithMeta without annotations
     """
     # a title at depth 1 gets the level (0, 0), everything else gets (depth, 1)
     is_top_title = depth == 1 and paragraph_type == "title"
     level1, level2 = (0, 0) if is_top_title else (depth, 1)

     level = HierarchyLevel(level_1=level1,
                            level_2=level2,
                            can_be_multiline=False,
                            paragraph_type=paragraph_type_meta)
     meta = ParagraphMetadata(paragraph_type=paragraph_type,
                              predicted_classes=None,
                              page_id=0,
                              line_id=None)
     return LineWithMeta(line=self.__get_text(value),
                         hierarchy_level=level,
                         metadata=meta,
                         annotations=[])
Beispiel #15
0
    def _get_lines_with_meta(
        self, hierarchy_level_extractor: HierarchyLevelExtractor
    ) -> List[LineWithMeta]:
        """
        Convert self.paragraph_list into lines with metadata, hierarchy level and annotations.

        Besides the annotations reported by ParagraphInfo, a paragraph gets an AttachAnnotation
        for every image/diagram uid and a TableAnnotation for every table uid registered under
        the index of the paragraph; these annotations cover the whole text of the paragraph.

        :param hierarchy_level_extractor: extractor applied to the accumulated lines
        :return: list of LineWithMeta as returned by the extractor
        """
        # annotation name reported by ParagraphInfo -> annotation class to build
        annotation_classes = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation,
        }
        result = []

        for index, paragraph in enumerate(self.paragraph_list):
            # info has the following form:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            info = ParagraphInfo(paragraph).get_info()

            text = info["text"]
            paragraph_type = info["type"]

            level = info["level"]
            # paragraphs without a level are treated as raw text
            hierarchy_level = (HierarchyLevel(level[0], level[1], False, paragraph_type)
                               if level
                               else HierarchyLevel.create_raw_text())

            # first item is the annotation name, the rest are constructor arguments
            annotations = [annotation_classes[item[0]](*item[1:])
                           for item in info["annotations"]]

            # attachments (images, then diagrams) referenced by this paragraph
            for refs in (self.image_refs, self.diagram_refs):
                if index not in refs:
                    continue
                for attach_uid in refs[index]:
                    annotations.append(
                        AttachAnnotation(attach_uid=attach_uid,
                                         start=0,
                                         end=len(text)))

            # tables referenced by this paragraph
            if index in self.table_refs:
                for table_uid in self.table_refs[index]:
                    annotations.append(
                        TableAnnotation(name=table_uid,
                                        start=0,
                                        end=len(text)))

            # line ids are 1-based: the first paragraph gets line_id=1
            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=index + 1)
            result.append(LineWithMeta(line=text,
                                       hierarchy_level=hierarchy_level,
                                       metadata=metadata,
                                       annotations=annotations,
                                       uid=paragraph.uid))
            # NOTE(review): the extractor is re-run on the whole accumulated list after
            # every paragraph - confirm whether one call after the loop would be enough
            result = hierarchy_level_extractor.get_hierarchy_level(result)
        return result
Beispiel #16
0
# An unstructured document consists of a flat list of lines with text and metadata;
# the hierarchy structure is hidden in the HierarchyLevel attribute of LineWithMeta.
# Let's build the first line, it is the root of the document tree:
text = "DOCUMENT TITLE"
metadata = ParagraphMetadata(
    paragraph_type="title",
    predicted_classes=None,
    page_id=0,
    line_id=0,
)

# The hierarchy level defines the position of this line in the document tree.
# The most important parameters of HierarchyLevel are level_1 and level_2:
# hierarchy levels are compared as the tuple (level_1, level_2), lesser -> closer to the root.
# can_be_multiline and paragraph_type: some parts of the document (for example the title)
# may take more than one line. If can_be_multiline is true, then several lines in a row
# with the same level_1, level_2 and paragraph_type will be merged into one tree node.
hierarchy_level = HierarchyLevel(
    level_1=0,
    level_2=0,
    can_be_multiline=True,
    paragraph_type="title",
)

# Annotations: one may attach information to a part of the text, for example that
# some word is written in an italic font. This line has none.
annotations = []

line1 = LineWithMeta(
    line=text,
    hierarchy_level=hierarchy_level,
    metadata=metadata,
    annotations=annotations,
)