Exemple #1
0
    def test_root_annotations(self):
        """
        Check the annotations of a root node created from two root lines.

        The node is expected to hold both annotations, with the annotation of the second line
        moved by the length of the first line (10 characters).
        """
        # (line text, annotation class, annotation end) for every source line
        line_specs = [
            ("bold text\n", BoldAnnotation, 10),
            ("italic text\n", ItalicAnnotation, 12),
        ]
        lines = [
            LineWithMeta(
                line=text,
                hierarchy_level=HierarchyLevel.create_root(),
                metadata=ParagraphMetadata(paragraph_type="root",
                                           predicted_classes=None,
                                           page_id=0,
                                           line_id=number),
                annotations=[annotation_cls(start=0, end=end, value="True")])
            for number, (text, annotation_cls, end) in enumerate(line_specs)
        ]

        root = TreeNode.create(lines=lines).get_root()
        found = root.annotations
        found.sort(key=lambda a: a.start)
        self.assertEqual(2, len(found))

        # (name, start, end) expected for the sorted annotations
        expected = [
            (BoldAnnotation.name, 0, 10),
            (ItalicAnnotation.name, 10, 22),
        ]
        for annotation, (name, start, end) in zip(found, expected):
            self.assertEqual(name, annotation.name)
            self.assertEqual("True", annotation.value)
            self.assertEqual(start, annotation.start)
            self.assertEqual(end, annotation.end)
Exemple #2
0
    def create(lines: List[LineWithMeta] = None) -> "TreeNode":
        """
        Creates a root node with given text.

        The text of the root is the concatenation of the texts of the given lines. The annotations of
        every line are passed through ``__shift_annotations`` together with the total length of the
        preceding lines and collected into the root node.

        :param lines: these lines should be the title of the document; ``None`` or an empty list
            stands for a document without title
        :return: root of the document tree (node id "0", no parent, no subparagraphs)
        """
        if lines is None:
            # the declared default must mean "no title" instead of failing on len(None)
            lines = []

        # the root is placed at the smallest page/line id of its lines, or at 0 when there are none
        page_id = min((line.metadata.page_id for line in lines), default=0)
        line_id = min((line.metadata.line_id for line in lines), default=0)

        annotations = []
        text_length = 0
        for line in lines:
            annotations.extend(
                TreeNode.__shift_annotations(line=line,
                                             text_length=text_length))
            text_length += len(line.line)
        text = "".join(line.line for line in lines)
        metadata = ParagraphMetadata(paragraph_type="root",
                                     page_id=page_id,
                                     line_id=line_id,
                                     predicted_classes=None)
        return TreeNode("0",
                        text,
                        annotations=annotations,
                        metadata=metadata,
                        subparagraphs=[],
                        hierarchy_level=HierarchyLevel.create_root(),
                        parent=None)
Exemple #3
0
    def read(self,
             path: str,
             document_type: Optional[str] = None,
             parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Read the presentation at the given path into an unstructured document.

        Every shape with a text frame becomes one "raw_text" line and every shape with a table
        becomes one table. Slides and the shapes inside a slide are numbered from 1; these numbers
        are stored as page_id and line_id.

        :param path: path to the file, passed to ``Presentation``
        :param document_type: not used by this method
        :param parameters: not used by this method
        :return: document with the lines returned by the hierarchy level extractor, the tables
            and no attachments
        """
        presentation = Presentation(path)
        text_lines = []
        found_tables = []

        for slide_number, slide in enumerate(presentation.slides, start=1):
            for shape_number, shape in enumerate(slide.shapes, start=1):
                if shape.has_text_frame:
                    line_metadata = ParagraphMetadata(
                        paragraph_type="raw_text",
                        predicted_classes=None,
                        page_id=slide_number,
                        line_id=shape_number)
                    text_line = LineWithMeta(line=shape.text,
                                             hierarchy_level=None,
                                             metadata=line_metadata,
                                             annotations=[])
                    text_lines.append(text_line)

                if shape.has_table:
                    # the table is stored as a list of rows with the texts of the cells
                    rows = [[cell.text for cell in row.cells]
                            for row in shape.table.rows]
                    found_tables.append(
                        Table(cells=rows,
                              metadata=TableMetadata(page_id=slide_number)))

        leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(
            text_lines)
        return UnstructuredDocument(lines=leveled_lines,
                                    tables=found_tables,
                                    attachments=[])
Exemple #4
0
    def _get_lines_with_meta(
            self, paragraph_list: List[Paragraph]) -> List[LineWithMeta]:
        """
        Convert paragraphs into lines with text, hierarchy level, metadata and annotations.

        Lines are numbered from 1 in the order of ``paragraph_list`` (stored as line_id, page_id is
        always 0); the uid of a line is the uid reported for the paragraph prefixed by ``self.path_hash``.

        :param paragraph_list: list of Paragraph
        :return: list of LineWithMeta
        """
        # maps the annotation name reported by ParagraphInfo to the annotation class;
        # it does not depend on the paragraph, so it is built once
        dict2annotations = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation
        }
        lines_with_meta = []

        for paragraph_id, paragraph in enumerate(paragraph_list, start=1):
            # line with meta:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            line_with_meta = ParagraphInfo(paragraph).get_info()

            text = line_with_meta["text"]
            uid = '{}_{}'.format(self.path_hash, line_with_meta["uid"])

            paragraph_type = line_with_meta["type"]
            level = line_with_meta["level"]
            if level:
                hierarchy_level = HierarchyLevel(level[0], level[1], False,
                                                 paragraph_type)
            else:
                # paragraphs without a level are treated as raw text
                hierarchy_level = HierarchyLevel(None, None, False, "raw_text")

            # the first item selects the annotation class, the rest are its constructor arguments
            annotations = [
                dict2annotations[annotation[0]](*annotation[1:])
                for annotation in line_with_meta["annotations"]
            ]

            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=paragraph_id)

            lines_with_meta.append(
                LineWithMeta(line=text,
                             hierarchy_level=hierarchy_level,
                             metadata=metadata,
                             annotations=annotations,
                             uid=uid))
            # NOTE(review): the extractor is applied to the whole accumulated list after every
            # appended line; kept inside the loop because its semantics are not visible here -
            # confirm whether a single call after the loop would be equivalent
            lines_with_meta = self.hierarchy_level_extractor.get_hierarchy_level(
                lines_with_meta)
        return lines_with_meta
Exemple #5
0
 def __get_line(self,
                text: str,
                level1: int,
                level2: int,
                hl: str = "list") -> LineWithMeta:
     """
     Build a line with the given text and hierarchy level.

     The metadata is created with the positional arguments ("list_item", None, 0, None)
     and the line gets no annotations.

     :param text: text of the line
     :param level1: first value passed to HierarchyLevel
     :param level2: second value passed to HierarchyLevel
     :param hl: fourth value passed to HierarchyLevel
     :return: the created line
     """
     return LineWithMeta(text,
                         hierarchy_level=HierarchyLevel(level1, level2, False, hl),
                         metadata=ParagraphMetadata("list_item", None, 0, None),
                         annotations=[])
Exemple #6
0
 def _get_lines_with_meta(self, path: str) -> List[LineWithMeta]:
     """
     Wrap every line of the file into a "raw_text" LineWithMeta.

     The uid of a line has the form "txt_<file hash>_<line id>", page_id is always 0
     and no hierarchy level or annotations are set.

     :param path: path to the file, passed to ``calculate_file_hash`` and ``self._get_lines``
     :return: list of lines in the order yielded by ``self._get_lines``
     """
     content_hash = calculate_file_hash(path=path)
     return [
         LineWithMeta(line=text,
                      hierarchy_level=None,
                      metadata=ParagraphMetadata(page_id=0,
                                                 line_id=number,
                                                 predicted_classes=None,
                                                 paragraph_type="raw_text"),
                      annotations=[],
                      uid="txt_{}_{}".format(content_hash, number))
         for number, text in self._get_lines(path)
     ]
Exemple #7
0
 def __create_list_line(line: LineWithMeta):
     """
     Build an empty line of type "list" derived from the given line.

     The new line keeps level_1, page_id and line_id of the given line; its level_2 is
     lower by 0.5 than level_2 of the given line.

     :param line: line whose hierarchy level and metadata are used
     :return: line with empty text, "list" hierarchy level and metadata, no annotations
     """
     source_level = line.hierarchy_level
     list_level = HierarchyLevel(
         level_1=source_level.level_1,
         level_2=source_level.level_2 - 0.5,  # noqa  the fractional level is intentional for lists
         paragraph_type="list",
         can_be_multiline=False
     )
     list_metadata = ParagraphMetadata(paragraph_type="list",
                                       page_id=line.metadata.page_id,
                                       line_id=line.metadata.line_id,
                                       predicted_classes=None)
     return LineWithMeta(line="",
                         hierarchy_level=list_level,
                         metadata=list_metadata,
                         annotations=[])
Exemple #8
0
 def _create_cell_line(table: Table, hierarchy_level: int, cell: str) -> LineWithMeta:
     """
     Build a "table_cell" line holding the text of one cell.

     :param table: table the cell belongs to, its page_id is copied into the line metadata
     :param hierarchy_level: base level, the line gets level_1 = hierarchy_level + 3 and level_2 = 0
     :param cell: text of the cell
     :return: line without line_id and without annotations
     """
     return LineWithMeta(
         line=cell,
         hierarchy_level=HierarchyLevel(level_1=hierarchy_level + 3,
                                        level_2=0,
                                        can_be_multiline=False,
                                        paragraph_type="table_cell"),
         metadata=ParagraphMetadata(paragraph_type="table_cell",
                                    predicted_classes=None,
                                    page_id=table.metadata.page_id,
                                    line_id=None),
         annotations=[])
Exemple #9
0
 def _create_table_line(table: Table, hierarchy_level: int) -> LineWithMeta:
     """
     Build an empty "table" line that represents the given table.

     :param table: table to represent, its page_id goes into the metadata and its uid into the line uid
     :param hierarchy_level: base level, the line gets level_1 = hierarchy_level + 1 and level_2 = 0
     :return: line with empty text, uid "table_<table uid>", no line_id and no annotations
     """
     table_info = table.metadata
     return LineWithMeta(
         line="",
         hierarchy_level=HierarchyLevel(level_1=hierarchy_level + 1,
                                        level_2=0,
                                        can_be_multiline=False,
                                        paragraph_type="table"),
         metadata=ParagraphMetadata(paragraph_type="table",
                                    predicted_classes=None,
                                    page_id=table_info.page_id,
                                    line_id=None),
         annotations=[],
         uid="table_{}".format(table_info.uid))
Exemple #10
0
 def get_api_dict(api: Api,
                  depth: int = 0,
                  name: str = 'TreeNode') -> Model:
     """
     Register and return the API model that describes a tree node.

     The 'subparagraphs' field is described by a nested model obtained from a recursive call
     with ``depth + 1`` and the name 'refTreeNode<depth>'. When ``depth`` equals the
     'recursion_deep_subparagraphs' value of the config the recursion stops and the
     children are described by the empty model 'others_TreeNode'.

     :param api: object used to register models
     :param depth: current depth of the nested description
     :param name: name under which the model is registered
     :return: result of ``api.model`` for the node description
     """
     node_fields = {
         'node_id': fields.String(
             description="Document element identifier. It is unique within one tree (i.e. "
                         "there will be no other such node_id in this tree, but in attachment "
                         "it may occur) The identifier has the form 0.2.1 where each number "
                         "means a serial number at the corresponding level of the hierarchy.",
             required=True,
             example="0.2.1"),
         'text': fields.String(description="text of node",
                               required=True,
                               example="Закон"),
         'annotations': fields.List(
             fields.Nested(Annotation.get_api_dict(api),
                           description="Text annotations "
                                       "(font, size, bold, italic and etc)")),
         'metadata': fields.Nested(ParagraphMetadata.get_api_dict(api),
                                   skip_none=True,
                                   allow_null=False,
                                   description="Paragraph meta information"),
     }

     # the children model is resolved after the other fields, as in a single dict literal
     if depth == get_config()['recursion_deep_subparagraphs']:
         child_model = api.model('others_TreeNode', {})
     else:
         child_model = TreeNode.get_api_dict(api,
                                             depth=depth + 1,
                                             name='refTreeNode' + str(depth))
     node_fields['subparagraphs'] = fields.List(
         fields.Nested(child_model),
         description="Node childes (with type 'TreeNode') of structure tree")

     return api.model(name, node_fields)
Exemple #11
0
 def read(
     self,
     path: str,
     document_type: Optional[str] = None,
     parameters: Optional[dict] = None
 ) -> Tuple[UnstructuredDocument, bool]:
     """
     Read the file at the given path line by line into an unstructured document.

     Every pair yielded by ``self._get_lines`` becomes a "raw_text" line with page_id 0; the
     lines are then passed through the hierarchy level extractor.

     :param path: path to the file
     :param document_type: not used by this method
     :param parameters: not used by this method
     :return: the document (without tables) and the constant ``False``
     """
     raw_lines = [
         LineWithMeta(line=text,
                      hierarchy_level=None,
                      metadata=ParagraphMetadata(page_id=0,
                                                 line_id=number,
                                                 predicted_classes=None,
                                                 paragraph_type="raw_text"),
                      annotations=[])
         for number, text in self._get_lines(path)
     ]
     leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(raw_lines)
     document = UnstructuredDocument(lines=leveled_lines, tables=[])
     return document, False
Exemple #12
0
 def create(texts: Iterable[str]) -> "TreeNode":
     """
     Creates a root node whose text is the given texts joined by a newline.

     :param texts: parts of the document title (empty for documents without title)
     :return: root of the document tree: node id "0", page_id 0, line_id 0, no annotations,
         no subparagraphs and no parent
     """
     root_metadata = ParagraphMetadata(paragraph_type="root",
                                       page_id=0,
                                       line_id=0,
                                       predicted_classes=None)
     root_level = HierarchyLevel(0, 0, True, paragraph_type="root")
     return TreeNode("0",
                     "\n".join(texts),
                     annotations=[],
                     metadata=root_metadata,
                     subparagraphs=[],
                     hierarchy_level=root_level,
                     parent=None)
Exemple #13
0
 def __handle_one_element(self, depth: int, value, paragraph_type: str,
                          paragraph_type_meta):
     """
     Build a line for a single element.

     A "title" element at depth 1 gets the level (0, 0); every other element gets the
     level (depth, 1). The text of the line is produced by ``self.__get_text``.

     :param depth: depth of the element
     :param value: element content, converted to text by ``self.__get_text``
     :param paragraph_type: paragraph type stored in the line metadata
     :param paragraph_type_meta: paragraph type stored in the hierarchy level
     :return: line without annotations, with page_id 0 and without line_id
     """
     is_top_title = depth == 1 and paragraph_type == "title"
     level1, level2 = (0, 0) if is_top_title else (depth, 1)

     element_level = HierarchyLevel(level_1=level1,
                                    level_2=level2,
                                    can_be_multiline=False,
                                    paragraph_type=paragraph_type_meta)
     element_metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                          predicted_classes=None,
                                          page_id=0,
                                          line_id=None)
     return LineWithMeta(line=self.__get_text(value),
                         hierarchy_level=element_level,
                         metadata=element_metadata,
                         annotations=[])
Exemple #14
0
    def _get_lines_with_meta(
        self, hierarchy_level_extractor: HierarchyLevelExtractor
    ) -> List[LineWithMeta]:
        """
        Convert ``self.paragraph_list`` into lines with text, hierarchy level, metadata and annotations.

        Besides the annotations reported for the paragraph itself, a line gets one AttachAnnotation
        per uid registered for its index in ``self.image_refs`` / ``self.diagram_refs`` and one
        TableAnnotation per uid registered in ``self.table_refs``; these annotations cover the
        whole text of the line. Lines are numbered from 1 (line_id), page_id is always 0.

        :param hierarchy_level_extractor: extractor applied to the accumulated lines
        :return: list of LineWithMeta
        """
        # maps the annotation name reported by ParagraphInfo to the annotation class;
        # it does not depend on the paragraph, so it is built once
        dict2annotations = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation,
        }
        lines_with_meta = []

        for i, paragraph in enumerate(self.paragraph_list):
            # line with meta:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            line_with_meta = ParagraphInfo(paragraph).get_info()

            text = line_with_meta["text"]

            paragraph_type = line_with_meta["type"]
            level = line_with_meta["level"]
            if level:
                hierarchy_level = HierarchyLevel(level[0], level[1], False,
                                                 paragraph_type)
            else:
                hierarchy_level = HierarchyLevel.create_raw_text()

            # the first item selects the annotation class, the rest are its constructor arguments
            annotations = [
                dict2annotations[annotation[0]](*annotation[1:])
                for annotation in line_with_meta["annotations"]
            ]

            # attachments referenced from this paragraph are attached to the whole line
            for object_dict in [self.image_refs, self.diagram_refs]:
                if i in object_dict:
                    for object_uid in object_dict[i]:
                        annotations.append(
                            AttachAnnotation(attach_uid=object_uid,
                                             start=0,
                                             end=len(text)))

            # the same for tables referenced from this paragraph
            if i in self.table_refs:
                for table_uid in self.table_refs[i]:
                    annotations.append(
                        TableAnnotation(name=table_uid,
                                        start=0,
                                        end=len(text)))

            # line ids start from 1 while paragraph indices start from 0
            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=i + 1)

            lines_with_meta.append(
                LineWithMeta(line=text,
                             hierarchy_level=hierarchy_level,
                             metadata=metadata,
                             annotations=annotations,
                             uid=paragraph.uid))
            # NOTE(review): the extractor is applied to the whole accumulated list after every
            # appended line; kept inside the loop because its semantics are not visible here -
            # confirm whether a single call after the loop would be equivalent
            lines_with_meta = hierarchy_level_extractor.get_hierarchy_level(
                lines_with_meta)
        return lines_with_meta
Exemple #15
0
    ["1", "Ivanov", "Ivan", "ISP RAS", "8-800"],
]
# A table also has some metadata; let's assume that our table is on the first page (page_id=0).
table_metadata = TableMetadata(page_id=0)

# Finally, let's build the table from its cells and metadata.
table = Table(cells=table_cells, metadata=table_metadata)

# Documents also contain some text.
# The logical structure of a document may be represented by a tree (see example_tree.png),
# but an unstructured document consists of a flat list of lines with text and metadata;
# the hierarchy structure is hidden in the HierarchyLevel attribute of LineWithMeta.
# Let's build the first line; it is the root of the document tree:
text = "DOCUMENT TITLE"
metadata = ParagraphMetadata(paragraph_type="title",
                             predicted_classes=None,
                             page_id=0,
                             line_id=0)
# The hierarchy level defines the position of this line in the document tree.

hierarchy_level = HierarchyLevel(
    # The most important parameters of HierarchyLevel are level_1 and level_2:
    # hierarchy levels are compared as the tuple (level_1, level_2); lesser -> closer to the root of the tree.
    level_1=0,
    level_2=0,
    # can_be_multiline and paragraph_type: some parts of the document (for example the title) may take more
    # than one line.
    # If can_be_multiline is true, then several lines in a row with the same level_1, level_2 and paragraph_type
    # will be merged into one tree node.
    can_be_multiline=True,
    paragraph_type="title")