Example #1
0
 def read(self,
          path: str,
          document_type: Optional[str] = None,
          parameters: Optional[dict] = None) -> UnstructuredDocument:
     """
     Build an unstructured document from the file located at `path`.

     The lines are obtained from `_get_lines_with_meta` and then passed through
     `hierarchy_level_extractor.get_hierarchy_level`.
     `document_type` and `parameters` are accepted but not used by this method.

     :param path: path to the file to read
     :return: document holding the processed lines, with empty tables and attachments
     """
     raw_lines = self._get_lines_with_meta(path)
     leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(raw_lines)
     return UnstructuredDocument(lines=leveled_lines, tables=[], attachments=[])
Example #2
0
    def read(self,
             path: str,
             document_type: Optional[str] = None,
             parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Read a presentation and collect the text and tables of its shapes.

        Slides and the shapes inside each slide are numbered from 1; the slide number is used
        as `page_id` and the shape number as `line_id`.  A shape with a text frame
        produces one raw-text line, a shape with a table produces one table.
        `document_type` and `parameters` are accepted but not used by this method.

        :param path: path to the presentation file
        :return: document with the extracted lines and tables and no attachments
        """
        presentation = Presentation(path)
        lines = []
        tables = []

        for slide_number, slide in enumerate(presentation.slides, start=1):
            for shape_number, shape in enumerate(slide.shapes, start=1):
                # the two checks are independent of each other, both are always evaluated
                if shape.has_text_frame:
                    text_metadata = ParagraphMetadata(paragraph_type="raw_text",
                                                      predicted_classes=None,
                                                      page_id=slide_number,
                                                      line_id=shape_number)
                    text_line = LineWithMeta(line=shape.text,
                                             hierarchy_level=None,
                                             metadata=text_metadata,
                                             annotations=[])
                    lines.append(text_line)

                if shape.has_table:
                    rows = []
                    for row in shape.table.rows:
                        rows.append([cell.text for cell in row.cells])
                    tables.append(Table(cells=rows, metadata=TableMetadata(page_id=slide_number)))

        leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(lines)
        return UnstructuredDocument(lines=leveled_lines, tables=tables, attachments=[])
Example #3
0
    def read(
        self,
        path: str,
        document_type: Optional[str] = None,
        parameters: Optional[dict] = None
    ) -> Tuple[UnstructuredDocument, bool]:
        """
        Read a json file and turn its content into a flat list of lines.

        The parsed json is traversed iteratively with a stack of `(element, depth)` pairs,
        starting from the root element at depth 1.  Non-empty dicts and lists are delegated to
        `__handle_dict` / `__handle_list` (which receive the result list and the stack),
        flat elements are converted to a raw-text line with `__handle_one_element`.
        `document_type` and `parameters` are accepted but not used by this method.

        :param path: path to the json file
        :return: tuple of the document (lines only, no tables) and the flag `False`
        """
        # Open in binary mode: json.load then detects UTF-8/UTF-16/UTF-32 itself instead of
        # relying on the locale-dependent default encoding of a text-mode open().
        with open(path, "rb") as file:
            json_data = json.load(file)
        stack = [(json_data, 1)]
        result = []
        while stack:
            # the most recently pushed element is processed first
            element, depth = stack.pop()
            if isinstance(element, dict) and len(element) > 0:
                self.__handle_dict(depth, element, result, stack)

            # NOTE: this check is not chained to the dict check above, so a dict element is
            # also tested by __is_flat below (kept as in the original implementation).
            if isinstance(element, list) and len(element) > 0:
                self.__handle_list(depth, element, result, stack)
            elif self.__is_flat(element):
                line = self.__handle_one_element(
                    depth=depth,
                    value=str(element),
                    paragraph_type=HierarchyLevel.raw_text,
                    paragraph_type_meta=HierarchyLevel.raw_text)
                result.append(line)

        return UnstructuredDocument(tables=[], lines=result), False
Example #4
0
 def insert_table(self, document: UnstructuredDocument) -> UnstructuredDocument:
     """
     Return a new document whose lines also contain the paragraphs created from the tables.

     Tables whose metadata says they were already inserted are skipped.  A table referenced by a
     table annotation of a line is placed right after that line; the remaining tables are added
     after the last line.  Raw-text lines get (in place) a hierarchy level whose `level_1` is one
     greater than the maximum `level_1` of the non-raw-text lines (0 if there are none).
     """
     # tables that still have to be inserted, keyed by their uid
     pending_tables = {}
     for table in document.tables:
         if not table.metadata.is_inserted:
             pending_tables[table.metadata.uid] = table

     structured_levels = [line.hierarchy_level.level_1
                          for line in document.lines
                          if not line.hierarchy_level.is_raw_text()]
     max_level = max(structured_levels, default=0)
     raw_text_level = HierarchyLevel(level_1=max_level + 1,
                                     level_2=0,
                                     can_be_multiline=True,
                                     paragraph_type=HierarchyLevel.raw_text)

     result_lines = []
     for line in document.lines:
         if line.hierarchy_level.is_raw_text():
             line.set_hierarchy_level(raw_text_level)
         result_lines.append(line)
         for annotation in line.annotations:
             if annotation.name != TableAnnotation.name:
                 continue
             table_uid = annotation.value
             if table_uid not in pending_tables:
                 continue
             table_paragraphs = self._create_paragraphs_from_table(table=pending_tables[table_uid],
                                                                   hierarchy_level=max_level)
             result_lines.extend(table_paragraphs)
             # each table is inserted at most once
             del pending_tables[table_uid]

     # tables that no line refers to go to the end of the document
     for leftover_table in pending_tables.values():
         result_lines.extend(self._create_paragraphs_from_table(table=leftover_table, hierarchy_level=max_level))
     return UnstructuredDocument(lines=result_lines, tables=document.tables, attachments=document.attachments)
Example #5
0
 def read(
     self,
     path: str,
     document_type: Optional[str] = None,
     parameters: Optional[dict] = None
 ) -> Tuple[UnstructuredDocument, bool]:
     """
     Read a workbook and convert every sheet into a table.

     Each sheet is fetched by index and handed to `__parse_sheet` together with that index.
     `document_type` and `parameters` are accepted but not used by this method.

     :param path: path to the workbook file
     :return: tuple of the document (tables only, no lines) and the flag `True`
     """
     with xlrd.open_workbook(path) as workbook:
         tables = [
             self.__parse_sheet(sheet_index, workbook.sheet_by_index(sheet_index))
             for sheet_index in range(workbook.nsheets)
         ]
         return UnstructuredDocument(lines=[], tables=tables), True
Example #6
0
 def read(self,
          path: str,
          document_type: Optional[str] = None,
          parameters: Optional[dict] = None) -> UnstructuredDocument:
     """
     Read a delimiter-separated file into a document with a single table.

     The delimiter is taken from `parameters["delimiter"]` when present; otherwise a tab is
     used for paths ending with ".tsv" and `self.default_separator` for everything else.
     `document_type` is accepted but not used by this method.

     :param path: path to the csv/tsv file
     :param parameters: optional dict that may contain the "delimiter" key; may be None
     :return: document without lines and attachments, containing one table with all rows
     """
     # `parameters` defaults to None, so it must not be dereferenced directly
     delimiter = (parameters or {}).get("delimiter")
     if delimiter is None:
         delimiter = "\t" if path.endswith(".tsv") else self.default_separator
     # newline="" is required by the csv module so that newlines embedded in quoted
     # fields are interpreted correctly; undecodable bytes are still skipped
     with open(path, errors="ignore", newline="") as file:
         data = list(csv.reader(file, delimiter=delimiter))
     table_metadata = TableMetadata(page_id=0)
     tables = [Table(cells=data, metadata=table_metadata)]
     return UnstructuredDocument(lines=[], tables=tables, attachments=[])
Example #7
0
    def read(self,
             path: str,
             document_type: Optional[str] = None,
             parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Parse the document at `path` and return its lines, tables and attachments.

        The document is parsed with `_parse_document`; attachments are requested from
        `attachment_extractor` using the directory and the file name of `path`.
        The parsed lines go through `__fix_lines` before being returned.
        `document_type` is accepted but not used by this method.

        :param path: path to the document file
        :param parameters: forwarded unchanged to the attachment extractor
        :return: document with fixed lines, the parsed tables and the extracted attachments
        """
        parsed_document = self._parse_document(path=path)

        directory, file_name = os.path.split(path)
        attachments = self.attachment_extractor.get_attachments(tmpdir=directory,
                                                                filename=file_name,
                                                                parameters=parameters)

        fixed_lines = self.__fix_lines(parsed_document.lines)
        return UnstructuredDocument(lines=fixed_lines,
                                    tables=parsed_document.tables,
                                    attachments=attachments)
Example #8
0
 def read(
     self,
     path: str,
     document_type: Optional[str] = None,
     parameters: Optional[dict] = None
 ) -> Tuple[UnstructuredDocument, bool]:
     """
     Read the file line by line and wrap every line into a raw-text line with metadata.

     `_get_lines` supplies `(line_id, line)` pairs; every line gets page id 0 and no
     hierarchy level, the levels are then assigned by `hierarchy_level_extractor`.
     `document_type` and `parameters` are accepted but not used by this method.

     :param path: path to the file to read
     :return: tuple of the document (lines only, no tables) and the flag `False`
     """
     raw_lines = [
         LineWithMeta(line=text,
                      hierarchy_level=None,
                      metadata=ParagraphMetadata(page_id=0,
                                                 line_id=number,
                                                 predicted_classes=None,
                                                 paragraph_type="raw_text"),
                      annotations=[])
         for number, text in self._get_lines(path)
     ]
     leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(raw_lines)
     return UnstructuredDocument(lines=leveled_lines, tables=[]), False
Example #9
0
 def read(self,
          path: str,
          document_type: Optional[str] = None,
          parameters: Optional[dict] = None) -> UnstructuredDocument:
     """
     Read a workbook: every sheet becomes a table, attachments are extracted on request.

     Each sheet is fetched by index and handed to `__parse_sheet` together with that index.
     Attachments are extracted only when `attachment_extractor.with_attachments`
     approves it for the given `parameters`; otherwise the list of attachments is empty.
     `document_type` is accepted but not used by this method.

     :param path: path to the workbook file
     :param parameters: forwarded unchanged to the attachment extractor
     :return: document without lines, with the sheet tables and the attachments
     """
     with xlrd.open_workbook(path) as workbook:
         tables = [
             self.__parse_sheet(sheet_index, workbook.sheet_by_index(sheet_index))
             for sheet_index in range(workbook.nsheets)
         ]

         attachments = []
         if self.attachment_extractor.with_attachments(parameters=parameters):
             directory, file_name = os.path.split(path)
             attachments = self.attachment_extractor.get_attachments(tmpdir=directory,
                                                                     filename=file_name,
                                                                     parameters=parameters)

         return UnstructuredDocument(lines=[], tables=tables, attachments=attachments)
Example #10
0
    def read(
        self,
        path: str,
        document_type: Optional[str] = None,
        parameters: Optional[dict] = None
    ) -> Tuple[UnstructuredDocument, bool]:
        """
        Read the document at `path`: extract its tables and its text lines.

        If opening the document or processing its tables raises `IndexError` or
        `PackageNotFoundError`, the document is treated as having no tables.
        The md5 hex digest of the file content is stored in `self.path_hash` before the
        lines are processed.
        `document_type` and `parameters` are accepted but not used by this method.

        :param path: path to the document file
        :return: tuple of the document (lines and tables) and the flag `True`
        """
        # tables: any of the two expected errors discards everything collected so far
        tables = []
        try:
            for raw_table in Document(path).tables:
                tables.append(self._process_table(raw_table))
        except (IndexError, PackageNotFoundError):
            tables = []

        # hash of the raw file content
        with open(path, "rb") as binary_file:
            content = binary_file.read()
            self.path_hash = hashlib.md5(content).hexdigest()

        # text lines
        text_lines = self._process_lines(path)

        return UnstructuredDocument(lines=text_lines, tables=tables), True