Esempio n. 1
0
 def test_two_raw_text(self):
     h1 = HierarchyLevel.create_raw_text()
     h2 = HierarchyLevel.create_raw_text()
     h3 = HierarchyLevel(level_1=1, level_2=2, can_be_multiline=False, paragraph_type="raw_text")
     self.assertTrue(h1 == h2)
     self.assertTrue(h1 >= h2)
     self.assertTrue(h1 <= h2)
     self.assertTrue(h1 == h3)
     self.assertTrue(h1 >= h3)
     self.assertTrue(h1 <= h3)
Esempio n. 2
0
 def test_raw_text_greater_than_any_other(self):
     list_item = HierarchyLevel(level_1=2, level_2=1, can_be_multiline=False, paragraph_type="list_item")
     raw_text = HierarchyLevel.create_raw_text()
     self.assertFalse(list_item > raw_text)
     self.assertFalse(list_item >= raw_text)
     self.assertFalse(list_item == raw_text)
     self.assertTrue(list_item < raw_text)
     self.assertTrue(list_item <= raw_text)
Esempio n. 3
0
    def _get_lines_with_meta(
        self, hierarchy_level_extractor: HierarchyLevelExtractor
    ) -> List[LineWithMeta]:
        """
        :param paragraph_list: list of Paragraph
        :return: list of LineWithMeta
        """
        lines_with_meta = []
        paragraph_id = 0

        for i, paragraph in enumerate(self.paragraph_list):
            # line with meta:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            paragraph_properties = ParagraphInfo(paragraph)
            line_with_meta = paragraph_properties.get_info()

            text = line_with_meta["text"]

            paragraph_type = line_with_meta["type"]
            level = line_with_meta["level"]
            if level:
                hierarchy_level = HierarchyLevel(level[0], level[1], False,
                                                 paragraph_type)
            else:
                hierarchy_level = HierarchyLevel.create_raw_text()

            dict2annotations = {
                "bold": BoldAnnotation,
                "italic": ItalicAnnotation,
                "underlined": UnderlinedAnnotation,
                "size": SizeAnnotation,
                "indentation": IndentationAnnotation,
                "alignment": AlignmentAnnotation,
                "style": StyleAnnotation,
            }

            annotations = []
            for annotation in line_with_meta["annotations"]:
                annotations.append(
                    dict2annotations[annotation[0]](*annotation[1:]))

            for object_dict in [self.image_refs, self.diagram_refs]:
                if i in object_dict:
                    for object_uid in object_dict[i]:
                        annotation = AttachAnnotation(attach_uid=object_uid,
                                                      start=0,
                                                      end=len(text))
                        annotations.append(annotation)

            if i in self.table_refs:
                for table_uid in self.table_refs[i]:
                    annotation = TableAnnotation(name=table_uid,
                                                 start=0,
                                                 end=len(text))
                    annotations.append(annotation)

            paragraph_id += 1
            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=paragraph_id)

            lines_with_meta.append(
                LineWithMeta(line=text,
                             hierarchy_level=hierarchy_level,
                             metadata=metadata,
                             annotations=annotations,
                             uid=paragraph.uid))
            lines_with_meta = hierarchy_level_extractor.get_hierarchy_level(
                lines_with_meta)
        return lines_with_meta