Beispiel #1
0
    def test_root_annotations(self):
        """
        Check that a root node built from two annotated title lines exposes both
        annotations, with offsets expressed relative to the joined root text.
        """
        bold_line = LineWithMeta(
            line="bold text\n",
            hierarchy_level=HierarchyLevel.create_root(),
            metadata=ParagraphMetadata(paragraph_type="root",
                                       predicted_classes=None,
                                       page_id=0,
                                       line_id=0),
            annotations=[BoldAnnotation(start=0, end=10, value="True")],
        )
        italic_line = LineWithMeta(
            line="italic text\n",
            hierarchy_level=HierarchyLevel.create_root(),
            metadata=ParagraphMetadata(paragraph_type="root",
                                       predicted_classes=None,
                                       page_id=0,
                                       line_id=1),
            annotations=[ItalicAnnotation(start=0, end=12, value="True")],
        )

        root = TreeNode.create(lines=[bold_line, italic_line]).get_root()
        found = root.annotations
        # order the annotations by their start offset so they can be compared positionally
        found.sort(key=lambda annotation: annotation.start)
        self.assertEqual(2, len(found))

        # the first line is 10 characters long, so the italic span (0..12 within
        # its own line) is expected at 10..22 in the root text
        expected = [
            (BoldAnnotation.name, "True", 0, 10),
            (ItalicAnnotation.name, "True", 10, 22),
        ]
        for annotation, (name, value, start, end) in zip(found, expected):
            self.assertEqual(name, annotation.name)
            self.assertEqual(value, annotation.value)
            self.assertEqual(start, annotation.start)
            self.assertEqual(end, annotation.end)
Beispiel #2
0
    def create(lines: List[LineWithMeta] = None) -> "TreeNode":
        """
        Create the root node of a document tree from the title lines of the document.

        The root text is the concatenation of the texts of the given lines. The annotations
        of every line are passed through ``TreeNode.__shift_annotations`` together with the
        length of the text that precedes this line in the root text. The root metadata uses
        the minimal ``page_id`` and the minimal ``line_id`` found among the lines
        (both are 0 when there are no lines).

        :param lines: lines that form the title of the document; ``None`` or an empty list
            for documents without title
        :return: root of the document tree
        """
        # The declared default is None; without this guard len(None) raised a TypeError.
        if lines is None:
            lines = []

        if lines:
            page_id = min(line.metadata.page_id for line in lines)
            line_id = min(line.metadata.line_id for line in lines)
        else:
            page_id = line_id = 0

        annotations = []
        text_length = 0  # length of the root text accumulated before the current line
        for line in lines:
            annotations.extend(
                TreeNode.__shift_annotations(line=line,
                                             text_length=text_length))
            text_length += len(line.line)
        text = "".join(line.line for line in lines)

        metadata = ParagraphMetadata(paragraph_type="root",
                                     page_id=page_id,
                                     line_id=line_id,
                                     predicted_classes=None)
        return TreeNode("0",
                        text,
                        annotations=annotations,
                        metadata=metadata,
                        subparagraphs=[],
                        hierarchy_level=HierarchyLevel.create_root(),
                        parent=None)