Ejemplo n.º 1
0
def _borderless_table_from_cells(cells: List[Cell],
                                 padding: int = 50) -> InferenceTable:
    """Build a synthetic "Borderless" table that encloses ``cells``.

    The bounding box is the common extent of all cells grown by ``padding``
    on every side.  The table is given a fixed confidence of 0.5 and keeps
    ``cells`` as its tags.  ``cells`` must be non-empty, otherwise
    ``min``/``max`` raise ``ValueError``.
    """
    return InferenceTable(
        bbox=BorderBox(
            top_left_y=min(cell.top_left_y for cell in cells) - padding,
            top_left_x=min(cell.top_left_x for cell in cells) - padding,
            bottom_right_y=max(cell.bottom_right_y
                               for cell in cells) + padding,
            bottom_right_x=max(cell.bottom_right_x
                               for cell in cells) + padding,
        ),
        confidence=0.5,
        label="Borderless",
        tags=cells,
    )


def inference_result_to_boxes(
    inference_page_result: List[Dict[str, Any]],
    borderless_cell_threshold: int = 20,
    borderless_padding: int = 50,
) -> Tuple[List[InferenceTable], List[Cell], List[BorderBox]]:
    """Split raw detections of one page into tables, headers and loose cells.

    Args:
        inference_page_result: raw detections; each item is read through its
            ``"label"`` key here and converted by ``_raw_to_table`` /
            ``_raw_to_cell``.
        borderless_cell_threshold: a synthetic "Borderless" table is created
            only when strictly more than this many cells are available.
        borderless_padding: margin added around the cells' common extent
            when a synthetic table is created.

    Returns:
        A tuple ``(tables, headers, not_matched)`` where ``tables`` are the
        de-duplicated detected tables plus any synthetic borderless tables,
        ``headers`` are the detections labelled ``"Header"`` and
        ``not_matched`` is whatever ``match_cells_and_tables`` returned.
    """
    raw_tables = [
        tag for tag in inference_page_result if tag["label"] in TABLE_TAGS
    ]
    raw_headers = [
        _raw_to_cell(tag) for tag in inference_page_result
        if tag["label"] == "Header"
    ]
    inference_tables: List[InferenceTable] = [
        _raw_to_table(raw_table) for raw_table in raw_tables
    ]

    filtered = _filter_double_detection(inference_tables)

    raw_cells = [
        _raw_to_cell(cell) for cell in inference_page_result
        if cell["label"] == CELL_TAG
    ]

    not_matched = match_cells_and_tables(raw_cells, filtered)

    # No table was detected at all, but there are many cells: treat all of
    # them as one borderless table.
    if len(raw_cells) > borderless_cell_threshold and not inference_tables:
        filtered.append(
            _borderless_table_from_cells(raw_cells, borderless_padding))
    # Many cells were left outside every table: group them as well.
    # NOTE(review): when no table was detected, the unmatched cells are
    # presumably the same as raw_cells, so both branches may add an
    # equivalent table - confirm whether that is intended.
    if len(not_matched) > borderless_cell_threshold:
        filtered.append(
            _borderless_table_from_cells(not_matched, borderless_padding))

    return filtered, raw_headers, not_matched
Ejemplo n.º 2
0
def convert_cells(cells: dict) -> list:
    """Turn the ``cells`` mapping into a list of ``CellLinked`` objects.

    Every key is indexed as ``(col, row)``.  Every value is a sequence whose
    item 0 holds the pixel coordinates (``"top_left"`` and ``"bottom_right"``
    pairs), items 1 and 2 the column span and row span, and whose
    second-to-last item is the cell text.
    """

    def _to_linked(position, payload):
        pixel_box = payload[0]
        corners = (
            pixel_box["top_left"][0],
            pixel_box["top_left"][1],
            pixel_box["bottom_right"][0],
            pixel_box["bottom_right"][1],
        )
        # The single text box covers exactly the same area as the cell.
        content = TextField(bbox=BorderBox(*corners), text=payload[-2])
        return CellLinked(
            *corners,
            text_boxes=[content],
            col=position[0],
            row=position[1],
            col_span=payload[1],
            row_span=payload[2],
        )

    return [
        _to_linked(position, payload) for position, payload in cells.items()
    ]
Ejemplo n.º 3
0
def bounding_box_to_bbox(bounding_box: PopplerBoundingBox, scale: float):
    """Scale a poppler bounding box and return it as a ``BorderBox``.

    The box is read as origin (``x``, ``y``) plus ``width``/``height``; each
    resulting corner coordinate is multiplied by ``scale`` and truncated
    with ``int``.
    """
    left = bounding_box.x
    top = bounding_box.y
    right = bounding_box.x + bounding_box.width
    bottom = bounding_box.y + bounding_box.height
    return BorderBox(
        top_left_x=int(left * scale),
        top_left_y=int(top * scale),
        bottom_right_x=int(right * scale),
        bottom_right_y=int(bottom * scale),
    )
Ejemplo n.º 4
0
    def find_tables_in_boxes(self, min_rows=2) -> Optional[List[Table]]:
        """Group the boxes in ``self.objs`` into tables with rows and columns.

        Boxes are visited ordered by ``(top_left_x, top_left_y)``.  A box
        that no existing table claims (``Table.is_box_from_table``) starts a
        new table and becomes its bounding box.  Any other box is added to
        the row keyed by ``box[1]`` and the column keyed by ``box[0]``; a new
        row/column is created when the key is unseen or currently belongs to
        a different table.

        Returns the tables having at least ``min_rows`` rows, or ``None``
        when there are none.
        """
        found = []
        rows_by_key = {}
        cols_by_key = {}

        ordered_boxes = sorted(
            self.objs, key=lambda obj: (obj.top_left_x, obj.top_left_y))
        for box in ordered_boxes:
            owner = next(
                (candidate for candidate in found
                 if candidate.is_box_from_table(box)),
                None,
            )
            if owner is None:
                found.append(Table(bbox=box, table_id=len(found)))
                continue

            row_key = box[1]
            col_key = box[0]

            known_row = rows_by_key.get(row_key)
            if known_row is not None and known_row.table_id == owner.table_id:
                known_row.add(box)
            else:
                # The row stretches to the right edge of the owning table.
                fresh_row = Row(
                    bbox=BorderBox(box[0], box[1], owner.bbox[2], box[3]),
                    table_id=owner.table_id,
                )
                fresh_row.add(box)
                owner.rows.append(fresh_row)
                rows_by_key[row_key] = fresh_row

            known_col = cols_by_key.get(col_key)
            if known_col is not None and known_col.table_id == owner.table_id:
                known_col.add(box)
            else:
                # The column stretches to the bottom edge of the owning table.
                fresh_col = Column(
                    bbox=BorderBox(box[0], box[1], box[2], owner.bbox[3]),
                    table_id=owner.table_id,
                )
                fresh_col.add(box)
                owner.cols.append(fresh_col)
                cols_by_key[col_key] = fresh_col

        big_enough = [
            candidate for candidate in found
            if len(candidate.rows) >= min_rows
        ]
        return big_enough or None
Ejemplo n.º 5
0
 def extract_table_text(self, img: numpy.ndarray,
                        border_box: BorderBox) -> List[TextField]:
     """Detect text boxes inside ``border_box`` and return them as fields.

     The detector runs on the crop of ``img`` given by ``border_box.box``;
     the detected boxes are shifted back by the crop origin so they are
     expressed in full-image coordinates.  Every returned ``TextField`` has
     an empty text.
     """
     left, top, right, bottom = border_box.box
     crop = img[top:bottom, left:right]
     detections, _elapsed = self.text_detector(crop)
     fields = []
     for found in paddle_result_to_bboxes(detections):
         shifted = BorderBox(found[0] + left, found[1] + top,
                             found[2] + left, found[3] + top)
         fields.append(TextField(bbox=shifted, text=""))
     return fields
Ejemplo n.º 6
0
def _raw_to_table(raw_table: Dict[str, Any]) -> InferenceTable:
    """Convert one raw detection dict into an ``InferenceTable``.

    ``raw_table['bbox']`` is unpacked as
    ``(top_left_x, top_left_y, bottom_right_x, bottom_right_y)``; the score
    becomes the confidence and the label is copied as is.
    """
    x_min, y_min, x_max, y_max = raw_table['bbox']
    frame = BorderBox(
        top_left_x=x_min,
        top_left_y=y_min,
        bottom_right_x=x_max,
        bottom_right_y=y_max,
    )
    score = raw_table['score']
    kind = raw_table['label']
    return InferenceTable(bbox=frame, confidence=score, label=kind)
Ejemplo n.º 7
0
def _raw_to_table(raw_table: Dict[str, Any]) -> InferenceTable:
    """Build an ``InferenceTable`` from a raw detection.

    The ``"bbox"`` entry must hold exactly four values, read in the order
    left, top, right, bottom.  ``"score"`` is used as the confidence and
    ``"label"`` as the table label.
    """
    left, top, right, bottom = raw_table["bbox"]
    outline = BorderBox(
        top_left_x=left,
        top_left_y=top,
        bottom_right_x=right,
        bottom_right_y=bottom,
    )
    return InferenceTable(
        bbox=outline,
        confidence=raw_table["score"],
        label=raw_table["label"],
    )
Ejemplo n.º 8
0
def excel_to_structured(excel_table: dict) -> StructuredTable:
    """
    Build a ``StructuredTable`` from a parsed Excel table description.

    The cells come from ``convert_cells(excel_table['cells'])``.  The table
    bounding box runs from the ``'top_left'`` corner of the first
    ``'dimensions'`` entry to the ``'bottom_right'`` corner of the second.
    """
    linked_cells = convert_cells(excel_table['cells'])

    upper_left = excel_table['dimensions'][0]['top_left']
    x_start = upper_left[0]
    y_start = upper_left[1]
    lower_right = excel_table['dimensions'][1]['bottom_right']
    x_end = lower_right[0]
    y_end = lower_right[1]

    outline = BorderBox(x_start, y_start, x_end, y_end)
    return StructuredTable(cells=linked_cells, bbox=outline)
Ejemplo n.º 9
0
def excel_to_structured(excel_table: dict) -> StructuredTable:
    """
    Convert the dict describing an Excel table into a ``StructuredTable``.

    ``excel_table["cells"]`` is passed through ``convert_cells``;
    ``excel_table["dimensions"]`` supplies the bounding box: entry 0 gives
    the ``"top_left"`` corner and entry 1 the ``"bottom_right"`` corner.
    """
    cells = convert_cells(excel_table["cells"])

    first_corner = excel_table["dimensions"][0]["top_left"]
    left, top = first_corner[0], first_corner[1]
    last_corner = excel_table["dimensions"][1]["bottom_right"]
    right, bottom = last_corner[0], last_corner[1]

    return StructuredTable(
        cells=cells,
        bbox=BorderBox(left, top, right, bottom),
    )
Ejemplo n.º 10
0
def construct_rows_from_boxes(cells: List[Cell], x_max) -> List[Row]:
    """Group ``cells`` into rows keyed by ``cell[1]``.

    Cells are visited ordered by ``(top_left_x, top_left_y)``.  The first
    cell seen for a key defines the row's box, which is stretched
    horizontally to ``x_max``.  Every row is created with ``table_id=1``.
    Rows are returned in the order their keys were first seen.
    """
    rows_by_key = {}

    ordered = sorted(cells, key=lambda item: (item.top_left_x,
                                              item.top_left_y))
    for cell in ordered:
        key = cell[1]
        existing = rows_by_key.get(key)
        if existing is not None:
            existing.add(cell)
            continue
        created = Row(
            bbox=BorderBox(cell[0], cell[1], x_max, cell[3]),
            table_id=1,
        )
        created.add(cell)
        rows_by_key[key] = created

    return [*rows_by_key.values()]
Ejemplo n.º 11
0
    def write(self):
        """Write all tables into the workbook, save it, and return the pages.

        ``self.tables_with_headers`` maps a sheet name to its tables.  The
        first entry is written to the workbook's active sheet, every later
        entry to a newly created sheet.  Header cells are written with
        ``HEADER_FILL`` / ``HEADER_FONT``; body cells are written plain.
        One page dict is produced per sheet (``page_num`` is the sheet
        index) with a bounding box from the origin to the largest table
        corner.

        Returns:
            The list of page dicts, one per sheet, after saving the
            workbook to ``self.outpath``.
        """

        def _first_text(cell):
            # Cells without any text box are written as empty instead of
            # failing with IndexError.
            return cell.text_boxes[0].text if cell.text_boxes else None

        pages = []
        for i, (sheet, tables) in enumerate(self.tables_with_headers.items()):
            # NOTE(review): the active sheet used for the first entry is not
            # given the sheet name here - confirm whether that is intended.
            ws = self.wb.create_sheet(sheet) if i else self.wb.active

            for table in tables:
                for header_cells in table.header:
                    for cell in header_cells:
                        added_cell = ws.cell(
                            row=cell.row,
                            column=cell.col,
                            value=_first_text(cell),
                        )
                        added_cell.fill = HEADER_FILL
                        added_cell.font = HEADER_FONT

                for cell in table.cells:
                    ws.cell(
                        row=cell.row,
                        column=cell.col,
                        value=_first_text(cell),
                    )

            # default=0 keeps a sheet without tables from raising ValueError;
            # it then yields an empty page box.
            page_right = max(
                (table.bbox.bottom_right_x for table in tables), default=0)
            page_bottom = max(
                (table.bbox.bottom_right_y for table in tables), default=0)
            pages.append(
                page_to_dict(
                    Page(
                        page_num=i,
                        bbox=BorderBox(
                            top_left_x=0,
                            top_left_y=0,
                            bottom_right_x=page_right,
                            bottom_right_y=page_bottom,
                        ),
                        tables=tables,
                    )))

        self.wb.save(self.outpath)
        return pages
Ejemplo n.º 12
0
    def process_page(self, image_path: Path, output_path: Path,
                     poppler_page) -> Dict[str, Any]:
        """Extract tables and free text from one page image.

        Steps, in order:
          1. load the image and create an empty ``Page`` sized like it;
          2. run the inference service to get candidate tables and headers;
          3. if any table was inferred, run bordered-table detection, match
             inferred tables against bordered ones, and build structured
             tables for the rest from the inference result;
          4. refresh table text, build headers, and draw debug images;
          5. extract text from the horizontal bands outside the tables.

        Args:
            image_path: page image; the file name up to the first ``"."``
                must be an integer and is used as the page number.
            output_path: directory under which debug artefacts are written.
            poppler_page: passed through to ``self._scale_poppler_result``.

        Returns:
            The page converted with ``page_to_dict``.
        """
        img = cv2.imread(str(image_path.absolute()))
        page = Page(
            page_num=int(image_path.name.split(".")[0]),
            bbox=BorderBox(
                top_left_x=0,
                top_left_y=0,
                # img.shape is indexed as (height, width, ...).
                bottom_right_x=img.shape[1],
                bottom_right_y=img.shape[0],
            ),
        )
        text_fields = self._scale_poppler_result(img, output_path,
                                                 poppler_page, image_path)

        logger.info("Start inference")
        inference_tables, headers = self.inference_service.inference_image(
            image_path)
        logger.info("End inference")
        self.visualizer.draw_object_and_save(
            img,
            inference_tables,
            Path(f"{output_path}/inference_result/{image_path.name}"),
            headers=headers,
        )

        # All table processing is skipped when nothing was inferred; only
        # the text extraction below runs in that case.
        if inference_tables:
            logger.info("Start bordered")
            image = detect_tables_on_page(
                image_path, draw=self.visualizer.should_visualize)
            logger.info("End bordered")
            text_fields_to_match = text_fields
            bordered_tables = []
            if image.tables:
                for bordered_table in image.tables:
                    # The second element returned by match_table_text is
                    # fed back as the pool for the next table (presumably
                    # the fields that are still unmatched).
                    in_table, text_fields_to_match = match_table_text(
                        bordered_table, text_fields_to_match)
                    # Result intentionally discarded; presumably called for
                    # its effect on bordered_table - TODO confirm.
                    _ = match_cells_table(in_table, bordered_table)
                    bordered_tables.append(
                        semi_border_to_struct(bordered_table, img.shape))

            # Inferred tables that no bordered table accounts for; these are
            # reconstructed from the inference result below.
            inf_tables_to_detect = []
            for inf_table in inference_tables:
                matched = False
                if image.tables:
                    for bordered_table in bordered_tables:
                        # A bordered table wins when the inferred table is
                        # labelled "Bordered", passes the inside-another
                        # check at 0.8, and the bordered table has more than
                        # half as many cells as the inferred table has tags.
                        # There is no break, so several bordered tables can
                        # be appended for one inferred table.
                        if (inf_table.bbox.box_is_inside_another(
                                bordered_table.bbox, 0.8)
                                and inf_table.label == "Bordered"
                                and len(bordered_table.cells) >
                                len(inf_table.tags) * 0.5):
                            matched = True
                            page.tables.append(bordered_table)
                if not matched:
                    inf_tables_to_detect.append(inf_table)

            semi_bordered_tables = []
            for inf_table in inf_tables_to_detect:
                in_inf_table, text_fields_to_match = match_table_text(
                    inf_table, text_fields_to_match)
                logger.info("Start paddle")
                paddle_fields = self.text_detector.extract_table_text(
                    img, inf_table.bbox)
                logger.info("End paddle")
                if paddle_fields:
                    in_inf_table = merge_text_fields(paddle_fields,
                                                     in_inf_table)

                mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                    inf_table.tags, in_inf_table)

                # "and False" makes this branch unreachable: the
                # semi-bordered path is switched off, so
                # semi_bordered_tables always stays empty here.
                if inf_table.label == "Borderless" and False:
                    semi_border = semi_bordered(img, inf_table)
                    if semi_border:
                        semi_bordered_tables.append(semi_border)
                        semi_border_score = match_cells_table(
                            in_inf_table, semi_border)
                        if (semi_border_score >= mask_rcnn_count_matches
                                and semi_border.count_cells() > len(
                                    inf_table.tags)):
                            struct_table = semi_border_to_struct(
                                semi_border, img.shape)
                            if struct_table:
                                page.tables.append(struct_table)
                            continue
                struct = self.extract_table_from_inference(
                    img, inf_table, not_matched, img.shape, image_path)
                if struct:
                    page.tables.append(struct)

            for table in page.tables:
                actualize_text(table, image_path, img.shape[:2])

            # TODO: Headers should be created only once
            cell_header_scores = []
            for table in page.tables:
                cell_header_scores.extend(
                    self.header_checker.get_cell_scores(table.cells))

            self.visualizer.draw_object_and_save(
                img,
                cell_header_scores,
                output_path / "cells_header" / f"{page.page_num}.png",
            )

            # Replace every table with its headered version; rows and
            # columns are checked with different third arguments (5 and 1).
            tables_with_header = []
            for table in page.tables:
                header_rows = self.create_header(table.rows, headers, 5)
                table_with_header = (
                    StructuredTableHeadered.from_structured_and_rows(
                        table, header_rows))
                header_cols = self.create_header(table.cols, headers, 1)
                # TODO: Cells should be actualized only once
                table_with_header.actualize_header_with_cols(header_cols)
                tables_with_header.append(table_with_header)
            page.tables = tables_with_header

            self.visualizer.draw_object_and_save(
                img,
                semi_bordered_tables,
                output_path.joinpath("semi_bordered_tables").joinpath(
                    image_path.name),
            )
            self.visualizer.draw_object_and_save(
                img,
                page.tables,
                output_path.joinpath("tables").joinpath(image_path.name),
            )
        logger.info("Start text extraction")
        with TextExtractor(str(image_path.absolute()),
                           seg_mode=PSM.SPARSE_TEXT) as extractor:
            # text_borders is a flat list [1, y, y2, ..., image_height];
            # consecutive pairs are the top and bottom of the bands that lie
            # between tables.
            # NOTE(review): the pairing follows the order of page.tables, so
            # it assumes tables are ordered top to bottom and do not overlap
            # vertically - confirm.
            text_borders = [1]
            for table in page.tables:
                _, y, _, y2 = table.bbox.box
                text_borders.extend([y, y2])
            text_borders.append(img.shape[0])
            text_candidate_boxes: List[BorderBox] = []
            for i in range(len(text_borders) // 2):
                # Only bands taller than 3 pixels are kept.
                if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                    text_candidate_boxes.append(
                        BorderBox(
                            top_left_x=1,
                            top_left_y=text_borders[i * 2],
                            bottom_right_x=img.shape[1],
                            bottom_right_y=text_borders[i * 2 + 1],
                        ))
            for box in text_candidate_boxes:
                text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                            box.width, box.height)
                if text:
                    page.text.append(TextField(box, text))
        logger.info("End text extraction")
        page_dict = page_to_dict(page)
        # The JSON dump of the page is written only in visualisation mode.
        if self.visualizer.should_visualize:
            save_page(page_dict,
                      output_path / "pages" / f"{page.page_num}.json")

        return page_dict
def reconstruct_table_from_grid(
        grid_table: GridTable,
        cells: List[Cell]) -> Tuple[Optional[StructuredTable], List[Cell]]:
    """Map detected ``cells`` onto ``grid_table`` and build a structured table.

    Each cell is linked to every grid row and column that passes
    ``box_is_inside_another(cell, 0.0)``; the linked cell spans from the
    first to the last matching row/column.  Grid cells that are not covered
    by any linked cell are added as empty 1x1 cells.

    Returns ``(table, not_matched)``, where ``not_matched`` are the cells
    that hit no row or no column.  When the grid has no columns, rows or
    cells the result is ``(None, cells)``.
    """
    unmatched = []
    linked = []
    # Grid cells indexed by their flattened row-major position.
    uncovered = {
        g_cell.row * len(grid_table.cols) + g_cell.col: g_cell
        for g_cell in grid_table.cells
    }
    for cell in cells:
        hit_rows = [(r_idx, row) for r_idx, row in enumerate(grid_table.rows)
                    if row.box_is_inside_another(cell, 0.0)]
        hit_cols = [(c_idx, col) for c_idx, col in enumerate(grid_table.cols)
                    if col.box_is_inside_another(cell, 0.0)]
        if not (hit_rows and hit_cols):
            unmatched.append(cell)
            continue

        first_row_idx, first_row = hit_rows[0]
        first_col_idx, first_col = hit_cols[0]
        last_row = hit_rows[-1][1]
        last_col = hit_cols[-1][1]
        linked.append(
            CellLinked(
                top_left_y=first_row.top_left_y,
                top_left_x=first_col.top_left_x,
                bottom_right_y=last_row.bottom_right_y,
                bottom_right_x=last_col.bottom_right_x,
                row=first_row_idx,
                col=first_col_idx,
                row_span=len(hit_rows),
                col_span=len(hit_cols),
                text_boxes=cell.text_boxes,
            ))
        # Every grid position under the linked cell is now covered.
        for r_idx, _row in hit_rows:
            for c_idx, _col in hit_cols:
                position = r_idx * len(grid_table.cols) + c_idx
                if uncovered.get(position):
                    del uncovered[position]

    linked.extend(
        CellLinked(
            top_left_y=g_cell.top_left_y,
            top_left_x=g_cell.top_left_x,
            bottom_right_y=g_cell.bottom_right_y,
            bottom_right_x=g_cell.bottom_right_x,
            row=g_cell.row,
            col=g_cell.col,
            row_span=1,
            col_span=1,
            text_boxes=[],
        ) for g_cell in uncovered.values())

    grid_is_usable = (grid_table.cols and grid_table.rows
                      and grid_table.cells)
    if not grid_is_usable:
        return None, cells

    outline = BorderBox(
        top_left_y=grid_table.rows[0].top_left_y,
        top_left_x=grid_table.cols[0].top_left_x,
        bottom_right_y=grid_table.rows[-1].bottom_right_y,
        bottom_right_x=grid_table.cols[-1].bottom_right_x,
    )
    return StructuredTable(bbox=outline, cells=linked), unmatched
Ejemplo n.º 14
0
    def process_page(self, image_path: Path, output_path: Path,
                     poppler_page) -> Dict[str, Any]:
        """Extract tables and free text from one page image.

        Inferred tables are first turned into scored structured tables
        (semi-bordered detection is tried for "Borderless" ones).  Bordered
        detection is then run only when an inferred table is labelled
        'Bordered' or when some detected table scored below 20% of its cell
        count; its results may replace or complement the inferred ones.
        Finally headers are built and text outside the tables is extracted.

        Args:
            image_path: page image; the file name up to the first ``"."``
                must be an integer and is used as the page number.
            output_path: directory under which debug artefacts are written.
            poppler_page: passed through to ``self._scale_poppler_result``.

        Returns:
            The page converted with ``page_to_dict``.  When nothing is
            inferred, the empty page is returned right away.
        """
        img = cv2.imread(str(image_path.absolute()))
        page = Page(page_num=int(image_path.name.split(".")[0]),
                    bbox=BorderBox(top_left_x=0,
                                   top_left_y=0,
                                   bottom_right_x=img.shape[1],
                                   bottom_right_y=img.shape[0]))
        text_fields = self._scale_poppler_result(img, output_path,
                                                 poppler_page, image_path)

        inference_tables, headers = self.inference_service.inference_image(
            image_path)
        # Early exit: no tables means no text extraction either in this
        # variant.
        if not inference_tables:
            return page_to_dict(page)

        has_bordered = any(
            [i_tab.label == 'Bordered' for i_tab in inference_tables])

        self.visualizer.draw_object_and_save(
            img, inference_tables,
            Path(f"{output_path}/inference_result/{image_path.name}"))

        text_fields_to_match = text_fields

        semi_bordered_tables = []
        # List of (score, structured table) pairs built from the inference.
        detected_tables = []
        for inf_table in inference_tables:
            # The second element returned by match_table_text is fed back as
            # the pool for the next table (presumably the fields that are
            # still unmatched).
            in_inf_table, text_fields_to_match = match_table_text(
                inf_table, text_fields_to_match)
            paddle_fields = self.text_detector.extract_table_text(
                img, inf_table.bbox)
            if paddle_fields:
                in_inf_table = merge_text_fields(paddle_fields, in_inf_table)

            mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                inf_table.tags, in_inf_table)

            if inf_table.label == 'Borderless':
                semi_border = semi_bordered(img, inf_table)
                if semi_border:
                    semi_bordered_tables.append(semi_border)
                    semi_border_score = match_cells_table(
                        in_inf_table, semi_border)
                    # Prefer the semi-bordered result when it scores at
                    # least as well and has more cells than the inference
                    # has tags. The continue then skips the inference-based
                    # reconstruction even if struct_table is falsy.
                    if semi_border_score >= mask_rcnn_count_matches and semi_border.count_cells(
                    ) > len(inf_table.tags):
                        struct_table = semi_border_to_struct(
                            semi_border, img.shape)
                        if struct_table:
                            detected_tables.append(
                                (semi_border_score, struct_table))
                        continue
            struct = self.extract_table_from_inference(img, inf_table,
                                                       not_matched, img.shape,
                                                       image_path)
            if struct:
                detected_tables.append((mask_rcnn_count_matches, struct))

        # Run the bordered detector only when it is likely to help.
        if has_bordered or any(score < 0.2 * len(table.cells)
                               for score, table in detected_tables):
            image = detect_tables_on_page(
                image_path, draw=self.visualizer.should_visualize)
            if image.tables:
                # Matching restarts from the full set of text fields.
                text_fields_to_match = text_fields
                for bordered_table in image.tables:
                    matched = False
                    # From here on inf_table is a structured table taken
                    # from detected_tables, not a raw inference table.
                    for score, inf_table in detected_tables:
                        if inf_table.bbox.box_is_inside_another(
                                bordered_table.bbox):
                            in_table, text_fields_to_match = match_table_text(
                                inf_table, text_fields_to_match)
                            paddle_fields = self.text_detector.extract_table_text(
                                img, inf_table.bbox)
                            if paddle_fields:
                                in_table = merge_text_fields(
                                    paddle_fields, in_table)

                            bordered_score = match_cells_table(
                                in_table, bordered_table)
                            # Keep the bordered table when it reaches at
                            # least half the score and half the cell count
                            # of the inferred one; otherwise keep the
                            # inferred table.
                            if bordered_score >= score * 0.5 \
                                    and bordered_table.count_cells() >= len(inf_table.cells) * 0.5:
                                struct_table = semi_border_to_struct(
                                    bordered_table, img.shape)
                                if struct_table:
                                    page.tables.append(struct_table)
                            else:
                                page.tables.append(inf_table)
                            # Removing while iterating is safe only because
                            # of the break that follows immediately.
                            detected_tables.remove((score, inf_table))
                            matched = True
                            break
                    if not matched:
                        in_table, text_fields_to_match = match_table_text(
                            bordered_table, text_fields_to_match)
                        # Result intentionally discarded; presumably called
                        # for its effect on bordered_table - TODO confirm.
                        _ = match_cells_table(in_table, bordered_table)
                        struct_table = semi_border_to_struct(
                            bordered_table, img.shape)
                        if struct_table:
                            page.tables.append(struct_table)
                # Inferred tables that no bordered table claimed are kept.
                if detected_tables:
                    page.tables.extend(
                        [inf_table for _, inf_table in detected_tables])
            else:
                page.tables.extend([tab for _, tab in detected_tables])
        else:
            page.tables.extend([tab for _, tab in detected_tables])
        for table in page.tables:
            actualize_text(table, image_path)

        # TODO: Headers should be created only once
        cell_header_scores = []
        for table in page.tables:
            cell_header_scores.extend(
                self.header_checker.get_cell_scores(table.cells))

        self.visualizer.draw_object_and_save(
            img, cell_header_scores,
            output_path / 'cells_header' / f"{page.page_num}.png")

        # Replace every table with its headered version; rows and columns
        # are checked with different third arguments (6 and 5).
        tables_with_header = []
        for table in page.tables:
            header_rows = self.create_header(table.rows, headers, 6)
            table_with_header = StructuredTableHeadered.from_structured_and_rows(
                table, header_rows)
            header_cols = self.create_header(table.cols, headers, 5)
            # TODO: Cells should be actualized only once
            table_with_header.actualize_header_with_cols(header_cols)
            tables_with_header.append(table_with_header)
        page.tables = tables_with_header

        with TextExtractor(str(image_path.absolute()),
                           seg_mode=PSM.SPARSE_TEXT) as extractor:
            # text_borders is a flat list [1, y, y2, ..., image_height];
            # consecutive pairs are the top and bottom of the bands that lie
            # between tables.
            # NOTE(review): the pairing follows the order of page.tables, so
            # it assumes tables are ordered top to bottom and do not overlap
            # vertically - confirm.
            text_borders = [1]
            for table in page.tables:
                _, y, _, y2 = table.bbox.box
                text_borders.extend([y, y2])
            text_borders.append(img.shape[0])
            text_candidate_boxes: List[BorderBox] = []
            for i in range(len(text_borders) // 2):
                # Only bands taller than 3 pixels are kept.
                if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                    text_candidate_boxes.append(
                        BorderBox(
                            top_left_x=1,
                            top_left_y=text_borders[i * 2],
                            bottom_right_x=img.shape[1],
                            bottom_right_y=text_borders[i * 2 + 1],
                        ))
            for box in text_candidate_boxes:
                text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                            box.width, box.height)
                if text:
                    page.text.append(TextField(box, text))

        self.visualizer.draw_object_and_save(
            img, semi_bordered_tables,
            output_path.joinpath('semi_bordered_tables').joinpath(
                image_path.name))
        self.visualizer.draw_object_and_save(
            img, page.tables,
            output_path.joinpath('tables').joinpath(image_path.name))
        page_dict = page_to_dict(page)
        # The JSON dump of the page is written only in visualisation mode.
        if self.visualizer.should_visualize:
            save_page(page_dict,
                      output_path / 'pages' / f"{page.page_num}.json")

        return page_dict
def _actualize_line_separators(
        table: GridTable,
        image_shape: Tuple[int, int]) -> Tuple[List[int], List[int]]:
    """Propose extra separator lines for grid cells that hold several cells.

    A grid cell containing more than one detected cell is a span candidate.
    For every grid column (then every grid row) that contains a candidate,
    separator lines are searched with ``_find_lines`` inside each grid cell
    and then greedily merged across neighbouring grid cells as long as the
    merged zone still yields at least as many lines as the starting cell.

    Returns:
        ``(v_lines, h_lines)``: de-duplicated line positions to add.  The
        order inside each list is unspecified because duplicates are
        removed through ``set``.  Both lists are empty when there is no
        span candidate.
    """
    # Grid cells with more than one inner cell, keyed by flattened
    # row-major index.
    span_candidates: Dict[int, GridCell] = {}
    for g_cell in table.cells:
        if len(g_cell.cells) > 1:
            span_candidates[len(table.cols) * g_cell.row + g_cell.col] = g_cell

    if not span_candidates:
        return [], []

    # Distinct grid columns / rows that contain at least one candidate.
    col_candidates = {}
    for g_cell in span_candidates.values():
        col_candidates[g_cell.col] = table.cols[g_cell.col]

    row_candidates = {}
    for g_cell in span_candidates.values():
        row_candidates[g_cell.row] = table.rows[g_cell.row]

    v_lines_to_add = []
    h_lines_to_add = []
    # ---- Vertical separators, processed per candidate column ----
    for cand_col in col_candidates.values():
        v_lines = []
        for g_cell in cand_col.g_cells:
            # The second value returned by _find_lines is used as the
            # vertical lines.
            _, v_cell_lines = _find_lines(g_cell, g_cell.cells, image_shape)
            if v_cell_lines:
                # Keep only lines strictly between the leftmost and
                # rightmost edge of the inner cells.
                min_v_cells = min([cell.top_left_x for cell in g_cell.cells])
                max_v_cells = max(
                    [cell.bottom_right_x for cell in g_cell.cells])
                v_cell_lines = list(
                    filter(
                        lambda line: min_v_cells < line < max_v_cells,
                        v_cell_lines,
                    ))
            v_lines.append(v_cell_lines)
        g_cell_v_line = list(zip(cand_col.g_cells, v_lines))
        # Tuples of (index, line count, grid cell, lines), sorted by line
        # count and then by top edge; entries without lines are dropped.
        cand_v_sort = list(
            filter(
                lambda x: x[3],
                sorted(
                    [(idx, len(v_cell_lines), g_cell, v_cell_lines)
                     for idx, (g_cell,
                               v_cell_lines) in enumerate(g_cell_v_line)],
                    key=lambda x: (x[1], x[2].top_left_y),
                ),
            ))

        i = 0
        while i < len(cand_v_sort):
            idx, l, g_cell, v_cell_lines = cand_v_sort[i]
            # Cannot trigger here because empty entries were filtered out
            # above; kept symmetrical with the row pass.
            if not l:
                i += 1
                continue
            cand_g_cells = g_cell.cells.copy()
            new_v_lines = v_cell_lines
            count_not_broken = 0
            # Greedily absorb the following candidates while the merged
            # zone keeps at least as many lines as the starting cell.
            for j in range(i + 1, len(cand_v_sort)):
                jdx, _, cand_j, v_lines_j = cand_v_sort[j]
                # Try compute common v_lines
                cells_to_check = cand_g_cells.copy()
                cells_to_check.extend(cand_j.cells)
                # Zone keeps the starting cell's horizontal extent and
                # reaches down to the bottom of cand_j.
                zone = BorderBox(
                    top_left_x=g_cell.top_left_x,
                    top_left_y=g_cell.top_left_y,
                    bottom_right_x=g_cell.bottom_right_x,
                    bottom_right_y=cand_j.bottom_right_y,
                )
                _, v = _find_lines(zone, cells_to_check, image_shape)
                if v:
                    min_v_cells = min(
                        [cell.top_left_x for cell in cells_to_check])
                    max_v_cells = max(
                        [cell.bottom_right_x for cell in cells_to_check])
                    v = list(
                        filter(lambda line: min_v_cells < line < max_v_cells,
                               v))
                if len(v) >= len(v_cell_lines):
                    cand_g_cells = cells_to_check
                    new_v_lines = v
                    count_not_broken += 1
                else:
                    break
            # Skip the candidates that were merged into this group.
            i += count_not_broken + 1
            v_lines_to_add.extend(new_v_lines)

    # ---- Horizontal separators, processed per candidate row ----
    for cand_row in row_candidates.values():
        h_lines = []
        for g_cell in cand_row.g_cells:
            # The first value returned by _find_lines is used as the
            # horizontal lines.
            h_cell_lines, _ = _find_lines(g_cell, g_cell.cells, image_shape)
            if h_cell_lines:
                # Keep only lines strictly between the topmost and
                # bottommost edge of the inner cells.
                min_h_cells = min([cell.top_left_y for cell in g_cell.cells])
                max_h_cells = max(
                    [cell.bottom_right_y for cell in g_cell.cells])
                h_cell_lines = list(
                    filter(
                        lambda line: min_h_cells < line < max_h_cells,
                        h_cell_lines,
                    ))
            h_lines.append(h_cell_lines)
        g_cell_h_line = list(zip(cand_row.g_cells, h_lines))
        # Unlike the column pass, entries without lines are not filtered
        # out here; they are skipped by the "if not l" check below.
        # NOTE(review): the sort key uses top_left_y although the cells of
        # one row presumably differ in x - confirm.
        cand_h_sort = sorted(
            [(idx, len(h_cell_lines), g_cell, h_cell_lines)
             for idx, (g_cell, h_cell_lines) in enumerate(g_cell_h_line)],
            key=lambda x: (x[1], x[2].top_left_y),
        )

        i = 0
        while i < len(cand_h_sort):
            idx, l, g_cell, h_cell_lines = cand_h_sort[i]
            if not l:
                i += 1
                continue
            cand_g_cells = g_cell.cells.copy()
            new_h_lines = h_cell_lines
            count_not_broken = 0
            for j in range(i + 1, len(cand_h_sort)):
                jdx, _, cand_j, h_lines_j = cand_h_sort[j]
                # Try to compute common h_lines
                cells_to_check = cand_g_cells.copy()
                cells_to_check.extend(cand_j.cells)
                # NOTE(review): this zone is built exactly like in the
                # column pass (own x extent, bottom taken from cand_j);
                # for a row one would expect it to extend to the right
                # edge of cand_j instead - possible copy-paste, confirm.
                zone = BorderBox(
                    top_left_x=g_cell.top_left_x,
                    top_left_y=g_cell.top_left_y,
                    bottom_right_x=g_cell.bottom_right_x,
                    bottom_right_y=cand_j.bottom_right_y,
                )
                h, _ = _find_lines(zone, cells_to_check, image_shape)
                if h:
                    min_h_cells = min(
                        [cell.top_left_y for cell in cells_to_check])
                    max_h_cells = max(
                        [cell.bottom_right_y for cell in cells_to_check])
                    h = list(
                        filter(lambda line: min_h_cells < line < max_h_cells,
                               h))
                if len(h) >= len(h_cell_lines):
                    cand_g_cells = cells_to_check
                    new_h_lines = h
                    count_not_broken += 1
                else:
                    break
            # Skip the candidates that were merged into this group.
            i += count_not_broken + 1
            h_lines_to_add.extend(new_h_lines)
    return list(set(v_lines_to_add)), list(set(h_lines_to_add))
Ejemplo n.º 16
0
def _merged_range_containing(merged_ranges, row: int, col: int):
    """Return the first range in *merged_ranges* covering (row, col), or None.

    The test uses the range's ``min_row``/``max_row``/``min_col``/``max_col``
    bounds instead of materializing every coordinate of the range.
    """
    for m_range in merged_ranges:
        if (m_range.min_row <= row <= m_range.max_row
                and m_range.min_col <= col <= m_range.max_col):
            return m_range
    return None


def _linked_cell(top: int, left: int, bottom: int, right: int, row: int,
                 col: int, row_span: int, col_span: int, text):
    """Build a ``CellLinked`` holding one ``TextField`` with the same box.

    ``row`` and ``col`` are 1-based worksheet indices; they are stored
    0-based on the resulting cell.
    """
    return CellLinked(
        top_left_y=top,
        top_left_x=left,
        bottom_right_y=bottom,
        bottom_right_x=right,
        col=col - 1,
        row=row - 1,
        col_span=col_span,
        row_span=row_span,
        text_boxes=[
            TextField(
                bbox=BorderBox(
                    top_left_y=top,
                    top_left_x=left,
                    bottom_right_y=bottom,
                    bottom_right_x=right,
                ),
                text=text,
            )
        ],
    )


def _fill_cell_spans(worksheet, cells, fill) -> None:
    """Apply *fill* to every worksheet cell spanned by each of *cells*."""
    for cell in cells:
        # Stored indices are 0-based, the worksheet API is 1-based.
        first_row = cell.row + 1
        first_col = cell.col + 1
        for r in range(first_row, first_row + cell.row_span):
            for c in range(first_col, first_col + cell.col_span):
                worksheet.cell(r, c).fill = fill


def comp_table(worksheet: Worksheet, row_dim: List[float],
               col_dim: List[float], s_cell: Tuple[int, int],
               e_cell: Tuple[int, int], headers: List[Cell]):
    """Build a structured table from a rectangular worksheet region.

    Args:
        worksheet: Sheet to read values and merged ranges from. Cells of the
            resulting table are colored in place when the table is kept.
        row_dim: Row boundary coordinates; ``row_dim[r - 1]`` and
            ``row_dim[r]`` are used as the top and bottom of 1-based row
            ``r``.
        col_dim: Column boundary coordinates, indexed like ``row_dim``.
        s_cell: ``(row, col)`` of the first cell of the region (1-based).
        e_cell: ``(row, col)`` of the last cell of the region (inclusive);
            clamped to the extent covered by ``row_dim``/``col_dim``.
        headers: Header detections passed to
            ``get_headers_using_structured``.

    Returns:
        The value returned by ``get_headers_using_structured`` for the
        assembled table.
    """
    m_ranges = list(worksheet.merged_cells.ranges)
    s_row, s_col = s_cell
    e_row, e_col = e_cell
    last_row_edge = len(row_dim) - 1
    last_col_edge = len(col_dim) - 1
    e_row = min(e_row, last_row_edge)
    e_col = min(e_col, last_col_edge)

    cells = []
    # Merged ranges already emitted: each one yields a single cell, created
    # when its first coordinate inside the region is visited.
    m_range_included = []
    for row in range(s_row, e_row + 1):
        for col in range(s_col, e_col + 1):
            cur_m_range = _merged_range_containing(m_ranges, row, col)
            if cur_m_range is None:
                cells.append(
                    _linked_cell(
                        top=int(row_dim[row - 1]),
                        left=int(col_dim[col - 1]),
                        bottom=int(row_dim[row]),
                        right=int(col_dim[col]),
                        row=row,
                        col=col,
                        row_span=1,
                        col_span=1,
                        text=extract_cell_value(worksheet.cell(row, col)),
                    ))
                continue
            if _merged_range_containing(m_range_included, row,
                                        col) is not None:
                continue
            m_range_included.append(cur_m_range)
            cells.append(
                _linked_cell(
                    top=int(row_dim[cur_m_range.min_row - 1]),
                    left=int(col_dim[cur_m_range.min_col - 1]),
                    # A merged range may extend past the known grid.
                    bottom=int(row_dim[min(cur_m_range.max_row,
                                           last_row_edge)]),
                    right=int(col_dim[min(cur_m_range.max_col,
                                          last_col_edge)]),
                    row=row,
                    col=col,
                    row_span=cur_m_range.max_row - cur_m_range.min_row + 1,
                    col_span=cur_m_range.max_col - cur_m_range.min_col + 1,
                    text=extract_cell_value(cur_m_range.start_cell),
                ))

    struct_table = StructuredTable(
        bbox=BorderBox(
            top_left_y=int(row_dim[s_row - 1]),
            top_left_x=int(col_dim[s_col - 1]),
            bottom_right_y=int(row_dim[e_row]),
            bottom_right_x=int(col_dim[e_col]),
        ),
        cells=cells,
    )
    struct_table_headered = get_headers_using_structured(struct_table, headers)

    # Only tables with more than three cells (body + header) are colored.
    total_cells = len(struct_table_headered.cells) + sum(
        len(pack) for pack in struct_table_headered.header)
    if total_cells > 3:
        head_cells = []
        for pack in struct_table_headered.header:
            head_cells.extend(pack)
        _fill_cell_spans(worksheet, head_cells, HEADER_FILL)
        body_fill = PatternFill(start_color="CC55BB",
                                end_color="CC55BB",
                                fill_type="solid")
        _fill_cell_spans(worksheet, struct_table_headered.cells, body_fill)
    return struct_table_headered
Ejemplo n.º 17
0
 def check_inside_and_put(inf_table: InferenceTable, inf_header: BorderBox):
     """Attach ``inf_header`` to ``inf_table`` if it lies inside the table.

     The header is appended to ``inf_table.header_boxes`` when
     ``inf_header.box_is_inside_another(inf_table.bbox)`` is truthy.

     Returns:
         True if the header was appended, False otherwise.
     """
     if not inf_header.box_is_inside_another(inf_table.bbox):
         return False
     inf_table.header_boxes.append(inf_header)
     return True
Ejemplo n.º 18
0
 def check_inside_and_put(inf_table: InferenceTable, inf_cell: BorderBox):
     """Attach ``inf_cell`` to ``inf_table`` if it lies inside the table.

     The cell is appended to ``inf_table.tags`` when
     ``inf_cell.box_is_inside_another(inf_table.bbox)`` is truthy.

     Returns:
         True if the cell was appended, False otherwise.
     """
     if not inf_cell.box_is_inside_another(inf_table.bbox):
         return False
     inf_table.tags.append(inf_cell)
     return True
Ejemplo n.º 19
0
def _exclusive_content_end(filled: Dict[int, bool]) -> int:
    """Return one past the highest index flagged non-empty, or 1 if none is."""
    return max((idx for idx, has_value in filled.items() if has_value),
               default=0) + 1


def _cumulative_edges(sizes) -> List[float]:
    """Return the running totals of *sizes*, preceded by a leading 0."""
    edges = [0]
    for size in sizes:
        edges.append(edges[-1] + size)
    return edges


def match_inf_res(xlsx_path: Path, images_dir: Path):
    """Convert every worksheet of a workbook into a page description.

    For each worksheet the trailing empty rows and columns are trimmed, the
    sheet grid is scaled to the size of the page image
    ``images_dir / "<page_num>.png"``, table proposals are turned into
    structured tables via ``comp_table`` and every remaining non-empty cell
    becomes a stand-alone ``TextField``. Sheets with no content or without a
    readable image are skipped.

    The workbook is saved back to ``xlsx_path`` afterwards (``comp_table``
    colors the cells of the tables it keeps).
    NOTE(review): the workbook is loaded with ``data_only=True``, so saving
    it over the original presumably drops formulas in favour of their cached
    values - confirm this is intended.

    Args:
        xlsx_path: Path of the workbook to process; it is overwritten.
        images_dir: Directory holding one ``<page_num>.png`` per worksheet,
            numbered by worksheet position starting at 0.

    Returns:
        A list with one ``page_to_dict`` result per processed worksheet.
    """
    LOGGER.info(
        "Initializing CascadeMaskRCNN with config: %s and model: %s",
        CASCADE_CONFIG_PATH,
        CASCADE_MODEL_PATH,
    )
    cascade_rcnn_detector = CascadeRCNNInferenceService(
        CASCADE_CONFIG_PATH, CASCADE_MODEL_PATH, True)
    pages = []
    workbook = load_workbook(str(xlsx_path.absolute()), data_only=True)
    try:
        for page_num, worksheet in enumerate(workbook.worksheets):
            # The sheet extent cannot change while only cells inside it are
            # accessed, so read it once instead of once per scanned row.
            max_row = worksheet.max_row
            max_col = worksheet.max_column

            row_fill = {
                row_id: any(
                    worksheet.cell(row_id, col_id).value
                    for col_id in range(1, max_col + 1))
                for row_id in range(1, max_row + 1)
            }
            col_fill = {
                col_id: any(
                    worksheet.cell(row_id, col_id).value
                    for row_id in range(1, max_row + 1))
                for col_id in range(1, max_col + 1)
            }
            # Exclusive end of the content: rows/columns 1 .. last - 1 are
            # kept, i.e. everything up to the last non-empty row/column.
            last_row = _exclusive_content_end(row_fill)
            last_col = _exclusive_content_end(col_fill)

            # Edge k is the bottom/right boundary of row/column k (edge 0 is
            # the sheet origin), in worksheet units.
            row_edges = _cumulative_edges(
                worksheet.row_dimensions[row_id].height or DEFAULT_HEIGHT
                for row_id in range(1, last_row))
            col_edges = _cumulative_edges(
                worksheet.column_dimensions[get_column_letter(col_id)].width
                or DEFAULT_WIDTH for col_id in range(1, last_col))
            height = row_edges[-1]
            width = col_edges[-1]
            if height == 0 or width == 0:
                continue

            image_path = images_dir / f"{page_num}.png"
            img = cv2.imread(str(image_path.absolute()))
            if img is None:
                LOGGER.warning(
                    "Image is empty or none, skipping processing on page %s",
                    page_num)
                continue
            img_shape = img.shape[:2]

            tables_proposals = clust_tables(worksheet, last_row, last_col)
            row_dim, col_dim = get_grid(worksheet, last_row, last_col)
            # Map worksheet units onto image pixels.
            y_scale = img_shape[0] / height
            x_scale = img_shape[1] / width
            row_dim = [dim * y_scale for dim in row_dim]
            col_dim = [dim * x_scale for dim in col_dim]

            headers = []
            # Header detection is skipped for very large images or sheets.
            if not any(s > 10000 for s in img_shape) and last_row < 1000:
                _, headers = cascade_rcnn_detector.inference_image(
                    image_path, padding=200)
            tables = [
                comp_table(worksheet, row_dim, col_dim, (prop[0], prop[1]),
                           (prop[2], prop[3]), headers)
                for prop in tables_proposals
            ]
            tables = [
                table for table in tables
                if len(table.cells) + sum(len(h) for h in table.header) > 3
            ]

            blocks = []
            blocks.extend(tables)
            for row_id in range(1, last_row):
                for col_id in range(1, last_col):
                    cell = worksheet.cell(row_id, col_id)
                    if not cell.value:
                        continue
                    # Cells covered by a table proposal are not repeated as
                    # free-standing text.
                    if any(y1 <= row_id <= y2 and x1 <= col_id <= x2
                           for y1, x1, y2, x2 in tables_proposals):
                        continue
                    blocks.append(
                        TextField(
                            bbox=BorderBox(
                                top_left_x=col_edges[col_id - 1] * x_scale,
                                top_left_y=row_edges[row_id - 1] * y_scale,
                                bottom_right_x=col_edges[col_id] * x_scale,
                                bottom_right_y=row_edges[row_id] * y_scale,
                            ),
                            text=extract_cell_value(cell),
                        ))

            pages.append(
                page_to_dict(
                    Page(
                        page_num=page_num,
                        bbox=BorderBox(
                            top_left_x=0,
                            top_left_y=0,
                            bottom_right_x=img_shape[1],
                            bottom_right_y=img_shape[0],
                        ),
                        tables=blocks,
                    )))
        workbook.save(str(xlsx_path.absolute()))
    finally:
        workbook.close()
    return pages