Example #1
0
    def read(self,
             path: str,
             document_type: Optional[str] = None,
             parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Read a presentation file and collect its text and tables.

        Every shape that has a text frame produces one raw-text line, and every
        shape that has a table produces one table. Slides and shapes are both
        numbered from 1: the slide number becomes ``page_id`` and the position
        of the shape on its slide becomes ``line_id``.

        :param path: path of the presentation file to open
        :param document_type: not used by this reader
        :param parameters: not used by this reader
        :return: document holding the lines (after hierarchy level extraction),
            the tables and an empty list of attachments
        """
        presentation = Presentation(path)
        text_lines = []
        found_tables = []

        for slide_number, slide in enumerate(presentation.slides, start=1):
            for shape_number, shape in enumerate(slide.shapes, start=1):

                if shape.has_text_frame:
                    line_metadata = ParagraphMetadata(paragraph_type="raw_text",
                                                      predicted_classes=None,
                                                      page_id=slide_number,
                                                      line_id=shape_number)
                    text_line = LineWithMeta(line=shape.text,
                                             hierarchy_level=None,
                                             metadata=line_metadata,
                                             annotations=[])
                    text_lines.append(text_line)

                # a separate check, not an ``elif``: both conditions are always evaluated
                if shape.has_table:
                    rows = []
                    for row in shape.table.rows:
                        rows.append([cell.text for cell in row.cells])
                    found_tables.append(
                        Table(cells=rows,
                              metadata=TableMetadata(page_id=slide_number)))

        leveled_lines = self.hierarchy_level_extractor.get_hierarchy_level(text_lines)
        return UnstructuredDocument(lines=leveled_lines,
                                    tables=found_tables,
                                    attachments=[])
Example #2
0
 def get_api_dict(api: Api) -> Model:
     """
     Build the API model named 'Table'.

     The model has two fields: 'cells', a list of lists of strings, and
     'metadata', a read-only nested model obtained from
     ``TableMetadata.get_api_dict``.

     :param api: api object used to register the model
     :return: the model returned by ``api.model``
     """
     cell_field = fields.String(description="Cell contains text")
     cells_field = fields.List(fields.List(cell_field),
                               description="matrix of cells")
     metadata_field = fields.Nested(TableMetadata.get_api_dict(api),
                                    readonly=True,
                                    description='Table meta information')
     table_fields = {
         'cells': cells_field,
         'metadata': metadata_field,
     }
     return api.model('Table', table_fields)
Example #3
0
 def _handle_table_xml(self, paragraph_xml: BeautifulSoup):
     """
     Store the table described by ``paragraph_xml`` and attach it to the last paragraph.

     The parsed table is appended to ``self.tables``. Its uid is appended to the
     entry of ``self.table_refs`` keyed by the index of the last element of
     ``self.paragraph_list``. If there is no paragraph yet, an empty one is
     created first so that such an index exists.

     :param paragraph_xml: xml of the table, passed to ``DocxTable``
     """
     docx_table = DocxTable(paragraph_xml, self.styles_extractor)
     parsed_table = Table(cells=docx_table.get_cells(),
                          metadata=TableMetadata(page_id=None, uid=docx_table.uid))
     self.tables.append(parsed_table)
     uid_to_link = docx_table.uid

     if not self.paragraph_list:
         # the table comes before any paragraph: insert an empty paragraph to refer to
         placeholder_xml = BeautifulSoup('<w:p></w:p>').body.contents[0]
         self.paragraph_list.append(self.__xml2paragraph(placeholder_xml))

     last_paragraph_index = len(self.paragraph_list) - 1
     self.table_refs[last_paragraph_index].append(uid_to_link)
Example #4
0
 def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
     """
     Convert one sheet into a Table.

     Cell values are read with ``sheet.cell_value`` for every row/column pair
     within ``sheet.nrows`` x ``sheet.ncols`` and stored row by row.

     :param sheet_id: identifier of the sheet, stored as ``page_id`` of the table metadata
     :param sheet: sheet to read the cell values from
     :return: table with the values of the sheet
     """
     row_count, column_count = sheet.nrows, sheet.ncols
     cells = [
         [sheet.cell_value(rowx=row_index, colx=column_index)
          for column_index in range(column_count)]
         for row_index in range(row_count)
     ]
     return Table(cells=cells, metadata=TableMetadata(page_id=sheet_id))
Example #5
0
 def read(self,
          path: str,
          document_type: Optional[str] = None,
          parameters: Optional[dict] = None) -> UnstructuredDocument:
     """
     Read a delimiter-separated text file into a document with a single table.

     The delimiter is taken from ``parameters["delimiter"]`` when it is given.
     Otherwise a tab is used for paths ending with ".tsv" and
     ``self.default_separator`` for everything else.

     :param path: path of the file to read
     :param document_type: not used by this reader
     :param parameters: optional reader parameters; only the "delimiter" key is read here
     :return: document with no lines, no attachments and one table (``page_id=0``)
         holding all parsed rows
     """
     # ``parameters`` defaults to None, so it must not be dereferenced directly
     if parameters is None:
         parameters = {}
     delimiter = parameters.get("delimiter")
     if delimiter is None:
         delimiter = "\t" if path.endswith(
             ".tsv") else self.default_separator
     # undecodable bytes are dropped (errors="ignore") rather than failing the whole read
     with open(path, errors="ignore") as file:
         csv_reader = csv.reader(file, delimiter=delimiter)
         data = list(csv_reader)
     table_metadata = TableMetadata(page_id=0)
     tables = [Table(cells=data, metadata=table_metadata)]
     return UnstructuredDocument(lines=[], tables=tables, attachments=[])
Example #6
0
# In this example we create an UnstructuredDocument: let's construct the document corresponding to example.docx
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.paragraph_metadata import ParagraphMetadata
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata

# First of all, let's create a table. A table consists of cells (a list of rows, where each row is a list of strings).
from dedoc.structure_parser.heirarchy_level import HierarchyLevel

# NOTE(review): the header row has 6 cells but the data row has only 5
# (no value for "Notes") - confirm that this is intended.
table_cells = [
    ["N", "Second name", "Name", "Organization", "Phone", "Notes"],
    ["1", "Ivanov", "Ivan", "ISP RAS", "8-800"],
]
# The table also has some metadata; let's assume that our table is on the first page.
table_metadata = TableMetadata(page_id=0)

# Finally, let's build the table.
table = Table(cells=table_cells, metadata=table_metadata)

# Documents also contain some text.
# The logical structure of a document may be represented by a tree (see example_tree.png),
# but an unstructured document consists of a flat list of lines with text and metadata;
# the hierarchy structure is hidden in the HierarchyLevel attribute of LineWithMeta.
# Let's build the first line; it is the root of the document tree:
text = "DOCUMENT TITLE"
metadata = ParagraphMetadata(paragraph_type="title",
                             predicted_classes=None,
                             page_id=0,
                             line_id=0)
# The hierarchy level defines the position of this line in the document tree.
Example #7
0
 def _process_table(table: DocxTable) -> Table:
     """
     Convert a docx table into a Table with the text of every cell.

     :param table: table whose ``rows`` and their ``cells`` are read
     :return: table of cell texts; the metadata has ``page_id=None``
     """
     text_rows = []
     for row in table.rows:
         text_rows.append([cell.text for cell in row.cells])
     return Table(cells=text_rows, metadata=TableMetadata(page_id=None))