Example #1
0
class RawTextReader(BaseReader):
    """
    Reader for plain text files: every line of the file becomes one
    LineWithMeta of type "raw_text".
    """

    def __init__(self):
        self.hierarchy_level_extractor = HierarchyLevelExtractor()

    def _get_lines(self, path: str) -> Iterable[Tuple[int, str]]:
        """
        Lazily iterate over the lines of the file.

        The file is decoded as utf-8-sig, bytes that cannot be decoded are
        ignored, and each line is NFC-normalized.

        :param path: path to the text file
        :return: pairs (zero-based line index, normalized line)
        """
        with codecs.open(path, errors="ignore", encoding="utf-8-sig") as file:
            for line_id, raw_line in enumerate(file):
                normalized = normalize('NFC', raw_line)
                # й replace matter
                yield line_id, normalized.replace("й", "й")

    def read(
        self,
        path: str,
        document_type: Optional[str] = None,
        parameters: Optional[dict] = None
    ) -> Tuple[UnstructuredDocument, bool]:
        """
        Read the file into an UnstructuredDocument without tables.

        :param path: path to the text file
        :param document_type: not used by this reader
        :param parameters: not used by this reader
        :return: the document and the flag False
        """
        raw_lines = [
            LineWithMeta(line=text,
                         hierarchy_level=None,
                         metadata=ParagraphMetadata(page_id=0,
                                                    line_id=number,
                                                    predicted_classes=None,
                                                    paragraph_type="raw_text"),
                         annotations=[])
            for number, text in self._get_lines(path)
        ]
        # the extractor is applied once, to the complete list of lines
        leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(
            raw_lines)
        return UnstructuredDocument(lines=leveled_lines, tables=[]), False

    def can_read(self, path: str, mime: str, extension: str,
                 document_type: Optional[str]) -> bool:
        """
        Accept files whose extension ends with ".txt" when no document
        type was requested.
        """
        if not extension.endswith(".txt"):
            return False
        return not document_type
Example #2
0
class PptxReader(BaseReader):
    """
    Reader for pptx-like presentations: the text of shapes becomes lines and
    the tables of shapes become Table objects.
    """

    def __init__(self):
        self.hierarchy_level_extractor = HierarchyLevelExtractor()

    def can_read(self, path: str, mime: str, extension: str, document_type: str) -> bool:
        """
        Accept the file when either its extension or its mime type is listed
        among the recognized pptx-like formats.
        """
        if extension in recognized_extensions.pptx_like_format:
            return True
        return mime in recognized_mimes.pptx_like_format

    @staticmethod
    def _shape_to_line(shape, page_id: int, line_id: int) -> LineWithMeta:
        """Wrap the text of a shape into a "raw_text" line without hierarchy level."""
        metadata = ParagraphMetadata(paragraph_type="raw_text",
                                     predicted_classes=None,
                                     page_id=page_id,
                                     line_id=line_id)
        return LineWithMeta(line=shape.text, hierarchy_level=None, metadata=metadata, annotations=[])

    @staticmethod
    def _shape_to_table(shape, page_id: int) -> Table:
        """Collect the cell texts of the shape's table row by row."""
        rows = []
        for row in shape.table.rows:
            rows.append([cell.text for cell in row.cells])
        return Table(cells=rows, metadata=TableMetadata(page_id=page_id))

    def read(self,
             path: str,
             document_type: Optional[str] = None,
             parameters: Optional[dict] = None) -> Tuple[UnstructuredDocument, bool]:
        """
        Read the presentation into an UnstructuredDocument.

        Slides and the shapes inside a slide are both numbered from 1; these
        numbers are stored as page_id and line_id of the produced lines.

        :param path: path to the presentation
        :param document_type: not used by this reader
        :param parameters: not used by this reader
        :return: the document and the flag True
        """
        presentation = Presentation(path)
        lines = []
        tables = []

        for slide_number, slide in enumerate(presentation.slides, start=1):
            for shape_number, shape in enumerate(slide.shapes, start=1):
                if shape.has_text_frame:
                    lines.append(self._shape_to_line(shape, slide_number, shape_number))
                if shape.has_table:
                    tables.append(self._shape_to_table(shape, slide_number))

        leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(lines)
        return UnstructuredDocument(lines=leveled_lines, tables=tables), True
Example #3
0
    def _get_lines_with_meta(
        self, hierarchy_level_extractor: HierarchyLevelExtractor
    ) -> List[LineWithMeta]:
        """
        Convert every paragraph of self.paragraph_list into a LineWithMeta.

        Besides the annotations reported by ParagraphInfo, a line receives an
        AttachAnnotation for each image/diagram uid and a TableAnnotation for
        each table uid registered under the paragraph index in
        self.image_refs, self.diagram_refs and self.table_refs.

        :param hierarchy_level_extractor: extractor whose get_hierarchy_level
            is applied to the accumulated lines
        :return: list of LineWithMeta
        """
        # annotation name (first item of an annotation record) -> annotation class;
        # the table does not depend on the paragraph, so it is built once
        dict2annotations = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation,
        }
        lines_with_meta = []

        for i, paragraph in enumerate(self.paragraph_list):
            # line with meta:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            line_with_meta = ParagraphInfo(paragraph).get_info()

            text = line_with_meta["text"]
            paragraph_type = line_with_meta["type"]
            level = line_with_meta["level"]
            if level:
                hierarchy_level = HierarchyLevel(level[0], level[1], False,
                                                 paragraph_type)
            else:
                hierarchy_level = HierarchyLevel.create_raw_text()

            # the rest of each record is passed to the annotation class positionally
            annotations = [
                dict2annotations[annotation[0]](*annotation[1:])
                for annotation in line_with_meta["annotations"]
            ]

            # attachments and tables referenced by this paragraph span the whole line
            for object_dict in [self.image_refs, self.diagram_refs]:
                if i in object_dict:
                    for object_uid in object_dict[i]:
                        annotations.append(
                            AttachAnnotation(attach_uid=object_uid,
                                             start=0,
                                             end=len(text)))

            if i in self.table_refs:
                for table_uid in self.table_refs[i]:
                    annotations.append(
                        TableAnnotation(name=table_uid,
                                        start=0,
                                        end=len(text)))

            # line ids are one-based while the reference dictionaries use the zero-based index
            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=i + 1)

            lines_with_meta.append(
                LineWithMeta(line=text,
                             hierarchy_level=hierarchy_level,
                             metadata=metadata,
                             annotations=annotations,
                             uid=paragraph.uid))
            # NOTE(review): the extractor is re-applied to the whole accumulated
            # list on every iteration, which is quadratic in the number of
            # paragraphs. It looks like a single call after the loop was
            # intended, but the call is kept here because the extractor is not
            # visible from this method - confirm it is idempotent before hoisting.
            lines_with_meta = hierarchy_level_extractor.get_hierarchy_level(
                lines_with_meta)
        return lines_with_meta
Example #4
0
 def __init__(self):
     """Create the instance with its own HierarchyLevelExtractor."""
     self.hierarchy_level_extractor = HierarchyLevelExtractor()
Example #5
0
 def __init__(self):
     """
     Create the instance with its own HierarchyLevelExtractor and with all
     document-related fields set to None.
     """
     self.hierarchy_level_extractor = HierarchyLevelExtractor()
     # document-related fields, grouped: source data first, parsed data second
     self.path_hash = None
     self.document_xml = None
     self.document_bs_tree = None
     self.paragraph_list = None
Example #6
0
class DocxReader(BaseReader):
    """
    Reader for docx-like files.

    Tables are extracted with the Document class, text lines are extracted
    from the xml parts of the archive (document body, headers, footers,
    footnotes and endnotes).
    """

    def __init__(self):
        self.hierarchy_level_extractor = HierarchyLevelExtractor()
        # archive of the last processed file (closed once processing is over)
        self.document_xml = None
        # tree of word/document.xml of the last processed file
        self.document_bs_tree = None
        # raw xml paragraphs of the last processed file
        self.paragraph_list = None
        # md5 hex digest of the last processed file, used as a prefix of line uids
        self.path_hash = None

    def can_read(self, path: str, mime: str, extension: str,
                 document_type: Optional[str]) -> bool:
        """
        Accept the file when its extension or mime type is a recognized
        docx-like format and no document type was requested.
        """
        return ((extension in recognized_extensions.docx_like_format
                 or mime in recognized_mimes.docx_like_format)
                and not document_type)

    def read(
        self,
        path: str,
        document_type: Optional[str] = None,
        parameters: Optional[dict] = None
    ) -> Tuple[UnstructuredDocument, bool]:
        """
        Read the file into an UnstructuredDocument.

        :param path: path to the docx-like file
        :param document_type: not used by this method
        :param parameters: not used by this method
        :return: the document and the flag True
        """
        # extract tables; this is best effort: when the file cannot be opened
        # or processed as a package the document is returned without tables
        try:
            document = Document(path)
            tables = [self._process_table(table) for table in document.tables]
        except (IndexError, PackageNotFoundError):
            tables = []

        # get hash of document (it identifies the file in line uids)
        with open(path, "rb") as f:
            self.path_hash = hashlib.md5(f.read()).hexdigest()
        # extract text lines
        lines = self._process_lines(path)

        return UnstructuredDocument(lines=lines, tables=tables), True

    @property
    def get_paragraph_list(self) -> List[BeautifulSoup]:
        """Raw xml paragraphs collected while processing the last file."""
        return self.paragraph_list

    @property
    def get_document_bs_tree(self) -> BeautifulSoup:
        """Tree of word/document.xml of the last processed file."""
        return self.document_bs_tree

    @staticmethod
    def _process_table(table: DocxTable) -> Table:
        """Collect the cell texts of the table row by row."""
        cells = [[cell.text for cell in row.cells] for row in table.rows]
        metadata = TableMetadata(page_id=None)
        return Table(cells=cells, metadata=metadata)

    def _process_lines(self, path: str) -> List[LineWithMeta]:
        """
        :param path: path to file for parsing
        :return: list of document lines with annotations
        """
        # The archive is closed as soon as the lines are built so that the
        # file handle does not leak; self.document_xml keeps referring to the
        # (closed) archive afterwards.
        with zipfile.ZipFile(path) as document_xml:
            self.document_xml = document_xml
            self.document_bs_tree = self.__get_bs_tree('word/document.xml')
            self.paragraph_list = []
            if not self.document_bs_tree:
                return []
            body = self.document_bs_tree.body
            if body is None:
                # nothing to iterate over; treated like a missing document part
                return []

            styles_extractor = StylesExtractor(
                self.__get_bs_tree('word/styles.xml'))
            num_tree = self.__get_bs_tree('word/numbering.xml')
            if num_tree:
                numbering_extractor = NumberingExtractor(num_tree,
                                                         styles_extractor)
            else:
                numbering_extractor = None
            styles_extractor.numbering_extractor = numbering_extractor

            # only the parts numbered 1..3 are looked up
            footers, headers = [], []
            for i in range(1, 4):
                footer = self.__get_bs_tree('word/footer{}.xml'.format(i))
                if footer:
                    footers.append(footer)
                header = self.__get_bs_tree('word/header{}.xml'.format(i))
                if header:
                    headers.append(header)
            footnotes = self.__get_bs_tree('word/footnotes.xml')
            endnotes = self.__get_bs_tree('word/endnotes.xml')

            # the list of paragraph with their properties, in the order:
            # headers, body, footnotes, endnotes, footers
            for header in headers:
                self.__add_to_paragraph_list(header)

            for paragraph in body:
                # ignore tables
                if paragraph.name == 'tbl':
                    continue
                if paragraph.name != 'p':
                    # any other element contributes the paragraphs nested in it
                    self.__add_to_paragraph_list(paragraph)
                    continue
                self.paragraph_list.append(paragraph)

            if footnotes:
                self.__add_to_paragraph_list(footnotes)
            if endnotes:
                self.__add_to_paragraph_list(endnotes)
            for footer in footers:
                self.__add_to_paragraph_list(footer)

            paragraph_list = [
                Paragraph(paragraph, styles_extractor, numbering_extractor)
                for paragraph in self.paragraph_list
            ]

            return self._get_lines_with_meta(paragraph_list)

    def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
        """
        gets xml bs tree from the given file inside the self.document_xml
        :param filename: name of file to extract the tree
        :return: BeautifulSoup tree or None if file wasn't found
        """
        try:
            tree = BeautifulSoup(self.document_xml.read(filename), 'xml')
        except KeyError:
            tree = None
        return tree

    def __add_to_paragraph_list(self, tree: BeautifulSoup) -> None:
        """Append every 'w:p' element found inside the tree to self.paragraph_list."""
        self.paragraph_list += tree.find_all('w:p')

    def _get_lines_with_meta(
            self, paragraph_list: List[Paragraph]) -> List[LineWithMeta]:
        """
        :param paragraph_list: list of Paragraph
        :return: list of LineWithMeta
        """
        # annotation name (first item of an annotation record) -> annotation class;
        # the table does not depend on the paragraph, so it is built once
        dict2annotations = {
            "bold": BoldAnnotation,
            "italic": ItalicAnnotation,
            "underlined": UnderlinedAnnotation,
            "size": SizeAnnotation,
            "indentation": IndentationAnnotation,
            "alignment": AlignmentAnnotation,
            "style": StyleAnnotation
        }
        lines_with_meta = []

        # line ids are one-based
        for paragraph_id, paragraph in enumerate(paragraph_list, start=1):

            # line with meta:
            # {"text": "",
            #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
            #  "level": (1,1) or None (hierarchy_level),
            #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
            line_with_meta = ParagraphInfo(paragraph).get_info()

            text = line_with_meta["text"]
            # the file hash is used as a prefix of the paragraph uid
            uid = '{}_{}'.format(self.path_hash, line_with_meta["uid"])

            paragraph_type = line_with_meta["type"]
            level = line_with_meta["level"]
            if level:
                hierarchy_level = HierarchyLevel(level[0], level[1], False,
                                                 paragraph_type)
            else:
                hierarchy_level = HierarchyLevel(None, None, False, "raw_text")

            # the rest of each record is passed to the annotation class positionally
            annotations = [
                dict2annotations[annotation[0]](*annotation[1:])
                for annotation in line_with_meta["annotations"]
            ]

            metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                         predicted_classes=None,
                                         page_id=0,
                                         line_id=paragraph_id)

            lines_with_meta.append(
                LineWithMeta(line=text,
                             hierarchy_level=hierarchy_level,
                             metadata=metadata,
                             annotations=annotations,
                             uid=uid))
            # NOTE(review): the extractor is re-applied to the whole accumulated
            # list on every iteration, which is quadratic in the number of
            # paragraphs. It looks like a single call after the loop was
            # intended, but the call is kept here because the extractor is not
            # visible from this class - confirm it is idempotent before hoisting.
            lines_with_meta = self.hierarchy_level_extractor.get_hierarchy_level(
                lines_with_meta)
        return lines_with_meta
Example #7
0
 def __init__(self):
     """Create the instance with its own HierarchyLevelExtractor and DocxAttachmentsExtractor."""
     self.hierarchy_level_extractor = HierarchyLevelExtractor()
     self.attachment_extractor = DocxAttachmentsExtractor()