コード例 #1
0
ファイル: main.py プロジェクト: ptlsra/DocumentClassification
# Export every detected table: one worksheet per table, one OCR'd string per
# cell. Cell coordinates are scaled by `mult` because they are applied to the
# resized crop of the table rather than to the original image.
for table in tables:
    worksheet = workbook.add_worksheet()
    table_entries = table.get_table_entries()

    # Crop the table out of the page, then enlarge the crop by `mult`.
    y_end = table.y + table.h
    x_end = table.x + table.w
    table_roi = cv.resize(
        image[table.y:y_end, table.x:x_end],
        (table.w * mult, table.h * mult),
    )
    cv.imwrite(out + table_name, table_roi)

    # Running cell counter; restarts at 0 for every table and is used both
    # in the cell image file name and as an argument to the utils helpers.
    num_img = 0
    for i, row in enumerate(table_entries):
        for j, entry in enumerate(row):
            # entry is indexed as (x, y, width, height) below, relative to
            # the un-scaled table crop.
            top = entry[1] * mult
            bottom = (entry[1] + entry[3]) * mult
            left = entry[0] * mult
            right = (entry[0] + entry[2]) * mult
            entry_roi = table_roi[top:bottom, left:right]

            # Write the raw cell image, clean it, then OCR the cleaned file.
            fname = out + "table/cell" + str(num_img) + ".jpg"
            cv.imwrite(fname, entry_roi)
            fname = utils.run_textcleaner(fname, num_img)
            text = utils.run_tesseract(fname, num_img, psm, oem)
            num_img += 1

            # Spreadsheet position mirrors the (row, column) table position.
            worksheet.write(i, j, text)

workbook.close()
コード例 #2
0
def getData_1(original_image):
    """Extract tabular and non-tabular data from a document image and print it.

    The image is resized to 830x1170, converted to grayscale, thresholded and
    inverted; table lines are extracted from the result, and each external
    contour of the combined line mask is treated as one table. Every cell of
    every table is passed to ``utils.run_tesseract`` and the recognised text
    is collected in a dictionary shaped as::

        {
            "table 1": {"row 1": [("col 1", text), ...], ...},
            ...
            "non-tabular data": <result of utils.getNonTabularData>,
        }

    The dictionary is serialised with ``json.dumps`` (tuples become JSON
    arrays) and printed to stdout.

    Args:
        original_image: Image array accepted by ``cv.resize``. It is converted
            with ``cv.COLOR_BGR2GRAY``, so a 3-channel BGR image is expected.

    Returns:
        None. The JSON document is only printed.
    """
    ###############################################
    # RESIZE AND CONVERT TO GRAYSCALE
    ###############################################
    original_image = cv.resize(
        original_image, (830, 1170), interpolation=cv.INTER_AREA
    )  # might have a lot of overhead depending on img size
    gray_image = cv.cvtColor(original_image, cv.COLOR_BGR2GRAY)

    ###############################################
    # APPLY ADAPTIVE THRESHOLD AND NEGATIVE
    ###############################################
    threshold = cv.adaptiveThreshold(gray_image, 255,
                                     cv.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv.THRESH_BINARY, 11, 2)
    threshold = cv.bitwise_not(threshold)
    # debug.showImage(threshold, "thresh", 80)#DEBUG

    ###############################################
    # EXTRACT TABLE LINES
    ###############################################
    # NOTE(review): 150 and 30 are presumably length/size parameters for the
    # horizontal and vertical line extraction -- confirm against utils.
    horizontal, vertical = utils.extractTableLines(threshold, 150, 30)

    ###############################################
    # CREATE LINE MASK AND FIND EXTERNAL CONTOURS
    ###############################################
    line_mask = horizontal + vertical
    # debug.showImage(line_mask, "mask", 80)#DEBUG

    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; [-2] is the contour list in both.
    table_ctrs = cv.findContours(line_mask, cv.RETR_EXTERNAL,
                                 cv.CHAIN_APPROX_SIMPLE)[-2]  # table outlines
    table_ctrs = utils.removeFlatContours(table_ctrs)

    # ###############################################
    # EXTRACT TABLES AND INDIVIDUAL CELLS AND OCR
    # ###############################################
    data = {}

    table_ctrs = utils.sortContours(
        table_ctrs,
        cv.boundingRect(
            line_mask)[2])  # sort contours left-to-right, top-to-bottom
    # debug.showContours(table_ctrs)#DEBUG

    # for each table outline contour, get cell contours then perform OCR
    for table_num, table_ctr in enumerate(table_ctrs, start=1):
        x, y, w, h = cv.boundingRect(table_ctr)

        # Pad the crop by one pixel on every side. The start indices are
        # clamped at 0: a table touching the top/left image edge would
        # otherwise produce a start of -1, which NumPy interprets as "last
        # row/column" and yields an empty crop. Stop indices past the image
        # size are already truncated by slicing.
        top = max(y - 1, 0)
        left = max(x - 1, 0)
        table_bbox = gray_image[top:y + h + 1, left:x + w + 1]

        cell_ctrs = utils.getCellContours(table_bbox, w)
        # debug.showContours(cell_ctrs)#DEBUG

        key = "table {}".format(table_num)
        data[key] = {}
        visited_rows = set()
        row = 0

        for cell_ctr in cell_ctrs:
            # Cell coordinates are relative to table_bbox, not the page.
            cx, cy, cw, ch = cv.boundingRect(cell_ctr)
            cell_bbox = table_bbox[cy:cy + ch, cx:cx + cw]

            # detect headers for specific table style: a dark cell (mean
            # gray level below 155) is inverted with a fixed threshold
            # before being handed to OCR
            if cv.mean(cell_bbox)[0] < 155:
                _, cell_bbox = cv.threshold(cell_bbox, 200, 255,
                                            cv.THRESH_BINARY_INV)

            # logic to differentiate different rows and cells: a cell whose
            # top y has not been seen yet opens a new row, otherwise it is
            # the next column of the current row.
            # NOTE(review): relies on cells of one row sharing the exact
            # same y and arriving in row order -- confirm getCellContours.
            if cy not in visited_rows:
                visited_rows.add(cy)
                row += 1
                col = 1
                data[key]["row " + str(row)] = []
            else:
                col += 1

            # signify if OCR returned empty string
            v = utils.run_tesseract(cell_bbox, 6, 3)
            if v == "":
                v = "NULL"

            data[key]["row " + str(row)].append(("col " + str(col), v))

    # #############################################
    # GET NON TABLE DATA
    # #############################################
    data["non-tabular data"] = utils.getNonTabularData(gray_image, table_ctrs)

    # #############################################
    # FORMAT JSON
    # #############################################
    # ensure_ascii=False keeps non-ASCII OCR output readable in the dump.
    json_data = json.dumps(data, indent=3, ensure_ascii=False)

    print(json_data)