Exemple #1
0
    def create_grid(self, path, paint=True):
        ''' Create a grid by detecting the tabular borders in an image.

        Lines are detected in the image, clustered into vertical and
        horizontal groups, and element ``[1][0]`` of every cluster is used as
        a column resp. row position for ``make_grid_from_positions``.

        :param path: where to find the image
        :param paint: if one should paint a test picture (not used by this
                      method at present; kept for interface compatibility)
        :return: the grid returned by ``make_grid_from_positions``, or
                 ``None`` if the file does not exist or loading it raises
                 ``OSError``
        '''
        imgfile = path

        # create an image processing object with the scanned page
        if not os.path.isfile(path):
            logging.info("%s not found, passing", path)
            return None
        try:
            image_to_process = imgproc.ImageProc(imgfile)
        except OSError:
            logging.info("%s is damaged", path)
            return None

        # detect the lines
        logging.info("detecting lines in image file '%s'...", imgfile)
        with timeit_context('line detecting'):

            with timeit_context('hlines'):
                # the vote threshold depends on the image width
                lines_hough = image_to_process.detect_lines(
                    canny_low_thresh=900,
                    canny_high_thresh=1030,
                    canny_kernel_size=3,
                    hough_rho_res=0.2,
                    hough_theta_res=np.pi / 20,
                    hough_votes_thresh=round(0.4 * image_to_process.img_w))
                logging.info("found %d lines at all", len(lines_hough))

            with timeit_context('hcluster'):
                # break clusters at half the minimum column width
                vertical_clusters = image_to_process.find_clusters(
                    imgproc.DIRECTION_VERTICAL,
                    find_clusters_1d_break_dist,
                    dist_thresh=self.MIN_COL_WIDTH / 2)
            logging.info("thereof %d vertical clusters", len(vertical_clusters))

            # break clusters at half the minimum row width
            horizontal_clusters = image_to_process.find_clusters(
                imgproc.DIRECTION_HORIZONTAL,
                find_clusters_1d_break_dist,
                dist_thresh=self.MIN_ROW_WIDTH / 2)
            logging.info("thereof %d horizontal clusters", len(horizontal_clusters))

        # one position per cluster
        # NOTE(review): x[1] presumably holds the cluster's values, so x[1][0]
        # is its first value -- confirm against find_clusters
        vertical_lines = [x[1][0] for x in vertical_clusters]
        horizontal_lines = [x[1][0] for x in horizontal_clusters]
        grid = make_grid_from_positions(vertical_lines, horizontal_lines)

        n_rows = len(grid)
        # an empty grid has no first row to take the column count from
        n_cols = len(grid[0]) if n_rows > 0 else 0
        logging.info("grid with %d rows, %d columns", n_rows, n_cols)

        return grid
def extract_data_frame(
        page_col_pos,
        page_row_pos,
        p_num,
        img_file,
        output_path,
        page,
):
    """Build the grid of one page, save it as JSON and return the page's table.

    :param page_col_pos: column positions passed to ``make_grid_from_positions``
    :param page_row_pos: row positions passed to ``make_grid_from_positions``
    :param p_num: page number; used in the log output and as key of the
                  saved page grid
    :param img_file: object whose ``name`` attribute provides the output
                     base name (the text before the first dot)
    :param output_path: prefix the grid file name is appended to as-is; no
                        path separator is inserted
    :param page: mapping whose ``'texts'`` entry is fitted into the grid
    :return: the result of ``datatable_to_dataframe`` for the fitted texts
    """
    grid = make_grid_from_positions(page_col_pos, page_row_pos)

    # an empty grid has no first row, so report zero columns for it
    n_rows = len(grid)
    if n_rows > 0:
        n_cols = len(grid[0])
    else:
        n_cols = 0
    print(f"> page {p_num}: grid with {n_rows} rows, {n_cols} columns")

    # the base name is everything before the first dot of the file name
    output_files_basename = img_file.name.partition('.')[0]
    grids_file_name = output_files_basename + '.grids.json'
    page_grids_file = output_path + grids_file_name
    print(f"saving page grids JSON file to '{page_grids_file}'")
    save_page_grids({p_num: grid}, page_grids_file)

    # put the page's text boxes into the grid cells and convert the result
    return datatable_to_dataframe(fit_texts_into_grid(page['texts'], grid))
            if len(
                    col_texts
            ) >= n_rows:  # there should be at least one text box per row
                filtered_col_positions.append(prev_col_x)
                last_col_x = col_x
            prev_col_x = col_x

        # manually add border for the last column because it has very few or no text boxes
        filtered_col_positions.append(
            filtered_col_positions[-1] +
            (rightmost_pos - filtered_col_positions[-1]) / 2)
        filtered_col_positions.append(rightmost_pos)

    # create the grid
    if filtered_col_positions:
        grid = make_grid_from_positions(filtered_col_positions, row_positions)

        n_rows = len(grid)
        n_cols = len(grid[0])
        print("> page %d: grid with %d rows, %d columns" %
              (p_num, n_rows, n_cols))

        page_grids[p_num] = grid
    else:  # this happens for the first page as there's no table on that
        print("> page %d: no table found" % p_num)

# save the page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's
# loadGridFile() function
        prev_col_x = col_positions[0]
        for col_x in col_positions[1:]:
            col_texts = [t for t in texts_in_table if prev_col_x < t['left'] + t['width']/2 <= col_x]

            if len(col_texts) >= n_rows:   # there should be at least one text box per row
                filtered_col_positions.append(prev_col_x)
                last_col_x = col_x
            prev_col_x = col_x
        
        # manually add border for the last column because it has very few or no text boxes
        filtered_col_positions.append(filtered_col_positions[-1] + (rightmost_pos - filtered_col_positions[-1]) / 2)
        filtered_col_positions.append(rightmost_pos)

    # create the grid
    if filtered_col_positions:
        grid = make_grid_from_positions(filtered_col_positions, row_positions)
        
        n_rows = len(grid)
        n_cols = len(grid[0])
        print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))
        
        page_grids[p_num] = grid
    else:  # this happens for the first page as there's no table on that
        print("> page %d: no table found" % p_num)
    
# save the page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's 
# loadGridFile() function

page_grids_file = os.path.join(OUTPUTPATH, output_files_basename + '.pagegrids.json')
Exemple #5
0
# image space" -> we scale them
# to "text box space"

# %% Create page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's
# loadGridFile() function

# Build one grid per page from its column and row positions, write an image
# of the drawn grid and save all grids to a single JSON file.
print("creating page grids for all pages...")
page_grids = {}
for p_num, p in pages.items():
    grid = make_grid_from_positions(col_positions[p_num], row_positions[p_num])

    # the image is drawn from the "*_noscale" positions
    # NOTE(review): the file name does not contain the page number, so with
    # more than one page each iteration overwrites the previous image -- confirm
    # that this is intended
    img_grid = iproc_obj.draw_grid(col_positions_noscale[p_num],
                                   row_positions_noscale[p_num])
    cv2.imwrite(os.path.join(OUTPUTPATH, imgfilebasename + '.grids.png'),
                img_grid)

    n_rows = len(grid)
    # an empty grid has no first row to take the column count from
    n_cols = len(grid[0]) if n_rows > 0 else 0
    print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))
    page_grids[p_num] = grid

page_grids_file = os.path.join(OUTPUTPATH, imgfilebasename + '.pagegrids.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
save_page_grids(page_grids, page_grids_file)

# %% Create data frames (requires pandas library)
Exemple #6
0
    has_wide_footer_text = any(t['width'] >= min_footer_text_width
                               for t in line_texts)
    # check if there's at least one wide text or if all of the required words for a footer match
    if has_wide_footer_text or all_a_in_b(words_in_footer, line_str):
        bottom_y = line_top
        break
else:
    bottom_y = p['height']

# keep only the line positions between the detected table top and bottom
page_rowpos = [y for y in pos_y if top_y <= y <= bottom_y]
print("> page %d: %d lines between [%f, %f]" %
      (p_num, len(page_rowpos), top_y, bottom_y))

from pdftabextract.extract import make_grid_from_positions

grid = make_grid_from_positions(page_colpos, page_rowpos)
# report the real grid dimensions instead of fixed numbers; an empty grid has
# no first row to take the column count from
n_rows = len(grid)
n_cols = len(grid[0]) if n_rows > 0 else 0
print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))

from pdftabextract.common import save_page_grids

# save the grid of this single page under its page number
page_grids_file = os.path.join(
    OUTPUTPATH, output_files_basename + '.pagegrids_p3_only.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
save_page_grids({p_num: grid}, page_grids_file)

from pdftabextract.extract import fit_texts_into_grid, datatable_to_dataframe

# put the page's text boxes into the grid cells
datatable = fit_texts_into_grid(p['texts'], grid)
Exemple #7
0
    line_heights = np.diff(pos_y)

    print(
        "> page %d: %d lines between [%f, %f], median text height = %f, median line height = %f, min line height = %f, max line height = %f"
        % (p_num, len(pos_y), top_y, bottom_y, median_text_height,
           np.median(line_heights), min(line_heights), max(line_heights)))

# %% Create page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's
# loadGridFile() function

# Build one grid per page from its column and line positions, write an image
# of the drawn grid and save all grids to a single JSON file.
print("creating page grids for all pages...")
page_grids = {}
for p_num, p in pages.items():
    grid = make_grid_from_positions(col_positions[p_num],
                                    line_positions[p_num])

    # pass a real list: a map object can only be iterated once and has no
    # length, which breaks as soon as draw_grid needs either
    scaled_line_positions = [y * page_scaling_y for y in line_positions[p_num]]
    img_grid = iproc_obj.draw_grid(col_positions_noscale[p_num],
                                   scaled_line_positions)
    cv2.imwrite(os.path.join(OUTPUTPATH, imgfilebasename + '.grids.png'),
                img_grid)

    n_rows = len(grid)
    # an empty grid has no first row to take the column count from
    n_cols = len(grid[0]) if n_rows > 0 else 0
    print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))
    page_grids[p_num] = grid

page_grids_file = os.path.join(OUTPUTPATH, imgfilebasename + '.pagegrids.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
save_page_grids(page_grids, page_grids_file)

# %% Create data frames (requires pandas library)
    else:  # this happens when the number of columns was not correctly detected
        diffsum = None

    if diffsum is None or diffsum > CORRECT_COLS_MIN_DIFFSUM:  # correct the columns for this page
        print('> page %d: corrected (diffsum was %f)' % (p_num, diffsum))
        x_offset = page_col_positions[p_num][0]
        corrected_pos = list(col_medians + x_offset)
        page_col_positions[p_num] = corrected_pos

#%% Create the page grids from the row and column positions and save them
# maps page number -> grid built from that page's column and row positions
page_grids = {}

for p_num, col_positions in page_col_positions.items():
    # create the grid
    row_positions = page_row_positions[p_num]
    grid = make_grid_from_positions(col_positions, row_positions)

    n_rows = len(grid)
    # an empty grid has no first row to take the column count from
    n_cols = len(grid[0]) if n_rows > 0 else 0
    print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))

    page_grids[p_num] = grid

# save the page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's
# loadGridFile() function

page_grids_file = os.path.join(OUTPUTPATH,
                               output_files_basename + '.pagegrids.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
Exemple #9
0
def do_tablextract(self, g, pdf_path, p_num):  # g is globals
    """Extract the tables of one PDF page and write them to CSV files.

    If ``self.pdf_type`` is ``'normal'`` the page is read with tabula and/or
    camelot (flavor ``'lattice'``), depending on whether ``'tabula'`` resp.
    ``'camelot'`` is contained in ``g.text_pdf_method``.

    Otherwise the page is processed with pdftabextract: ``pdftohtml`` writes
    an XML file with the page's text boxes, lines are detected in the page
    image, a found rotation or skew is undone, the lines are clustered into
    column and row borders, a grid is built from them and the text boxes are
    fitted into that grid. For ``self.doc_type == 'image'`` camelot (flavor
    ``'stream'``) is run before that as well.

    :param g: settings object; the attributes ``text_pdf_method``,
              ``MIN_COL_WIDTH`` and ``MIN_ROW_WIDTH`` are read
    :param pdf_path: path of the PDF file
    :param p_num: number of the page to process (formatted with ``%d``)
    :return: ``None``; all results are written to files below the folders
             configured on ``self``
    :raises ValueError: if the pdftabextract branch is taken and
                        ``self.doc_type`` is neither ``'image'`` nor ``'pdf'``
    """
    import subprocess  # local import: only the pdftohtml call needs it

    print('Starting tablextract')

    def export_camelot_tables(flavor):
        # read the page with camelot using the given flavor and export to CSV
        tables = camelot.read_pdf(pdf_path, flavor=flavor, pages=str(p_num))
        for i in range(len(tables)):
            table_file_path = '%s/%s-%s.csv' % (self.tables_folder_camelot,
                                                p_num, i)
            # NOTE(review): export() is called on the whole table list once
            # per table; presumably a single export (or one call per table
            # object) was intended -- confirm before changing the file layout
            tables.export(table_file_path, f='csv', compress=False)

    if self.pdf_type == 'normal':
        print(pdf_path, p_num)
        if 'tabula' in g.text_pdf_method:
            tables = read_pdf(
                pdf_path,
                pages=[p_num],
                multiple_tables=True,
                java_options=
                '-Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider')
            for i, table in enumerate(tables):
                csv_file_path = '%s/%s-%s.csv' % (self.tables_folder_tabula,
                                                  p_num, i)
                filled = table.fillna('')
                try:
                    filled.to_csv(csv_file_path, encoding='utf-8')
                except UnicodeError:
                    # fall back to cp1252 only for encoding problems; any
                    # other error would fail the same way on the retry
                    filled.to_csv(csv_file_path, encoding='cp1252')
        if 'camelot' in g.text_pdf_method:
            export_camelot_tables('lattice')

    else:
        if self.doc_type == 'image':
            # trying camelot
            print('Doing camelot-stream')
            export_camelot_tables('stream')

        # Trying pdftabextract
        filename = os.path.basename(pdf_path).split('.')[0].split('/')[0]
        DATAPATH = self.images_folder  # 'data/'
        INPUT_XML = '%s/%s.xml' % (self.images_folder, filename)
        # an argument list instead of a shell string, so that paths with
        # spaces or shell metacharacters reach pdftohtml unchanged; the exit
        # status is not checked, as before
        subprocess.run([
            'pdftohtml', '-c', '-hidden', '-xml', '-enc', 'UTF-8',
            '-f', str(p_num), '-l', str(p_num), str(pdf_path), INPUT_XML,
        ])

        # Load the XML that was generated with pdftohtml
        xmltree, xmlroot = read_xml(INPUT_XML)
        # parse it and generate a dict of pages
        pages = parse_pages(xmlroot)
        p = pages[p_num]

        # Detecting lines
        imgfilebasename = '%s-%s_1' % (filename, p_num)
        if self.doc_type == 'image':
            imgfile = self.file_path
        elif self.doc_type == 'pdf':
            imgfile = '%s/%s-%s_1.png' % (DATAPATH, filename, p_num)
        else:
            # without an image file nothing below can work
            raise ValueError("unsupported doc_type %r" % (self.doc_type,))

        print("\npage %d: detecting lines in image file '%s'..." %
              (p_num, imgfile))

        # create an image processing object with the scanned page
        iproc_obj = imgproc.ImageProc(imgfile)

        # calculate the scaling of the image file in relation to the text boxes coordinate system dimensions
        page_scaling_x = iproc_obj.img_w / p['width']  # scaling in X-direction
        page_scaling_y = iproc_obj.img_h / p['height']  # scaling in Y-direction

        # detect the lines
        lines_hough = iproc_obj.detect_lines(canny_kernel_size=3,
                                             canny_low_thresh=50,
                                             canny_high_thresh=150,
                                             hough_rho_res=1,
                                             hough_theta_res=np.pi / 500,
                                             hough_votes_thresh=round(
                                                 0.2 * iproc_obj.img_w))
        print("> found %d lines" % len(lines_hough))

        # helper function to save an image
        def save_image_w_lines(iproc_obj, imgfilebasename):
            img_lines = iproc_obj.draw_lines(orig_img_as_background=True)
            img_lines_file = os.path.join(
                self.temp_folder, '%s-lines-orig.png' % imgfilebasename)

            print("> saving image with detected lines to '%s'" %
                  img_lines_file)
            cv2.imwrite(img_lines_file, img_lines)

        save_image_w_lines(iproc_obj, imgfilebasename)

        # find rotation or skew
        # the parameters are:
        # 1. the minimum threshold in radians for a rotation to be counted as such
        # 2. the maximum threshold for the difference between horizontal and vertical line rotation (to detect skew)
        # 3. an optional threshold to filter out "stray" lines whose angle is too far apart from the median angle of
        #    all other lines that go in the same direction (no effect here)
        rot_or_skew_type, rot_or_skew_radians = iproc_obj.find_rotation_or_skew(
            radians(0.5),  # uses "lines_hough"
            radians(1),
            omit_on_rot_thresh=radians(0.5))

        # rotate back or deskew text boxes
        needs_fix = True
        if rot_or_skew_type == ROTATION:
            print("> rotating back by %f°" % -degrees(rot_or_skew_radians))
            rotate_textboxes(p, -rot_or_skew_radians, pt(0, 0))
        elif rot_or_skew_type in (SKEW_X, SKEW_Y):
            print("> deskewing in direction '%s' by %f°" %
                  (rot_or_skew_type, -degrees(rot_or_skew_radians)))
            deskew_textboxes(p, -rot_or_skew_radians, rot_or_skew_type,
                             pt(0, 0))
        else:
            needs_fix = False
            print("> no page rotation / skew found")

        if needs_fix:
            # rotate back or deskew detected lines
            lines_hough = iproc_obj.apply_found_rotation_or_skew(
                rot_or_skew_type, -rot_or_skew_radians)

            save_image_w_lines(iproc_obj, imgfilebasename + '-repaired')

        # save repaired XML (i.e. XML with deskewed textbox positions)
        repaired_xmlfile = os.path.join(self.temp_folder,
                                        filename + '.repaired.xml')

        print("saving repaired XML file to '%s'..." % repaired_xmlfile)
        xmltree.write(repaired_xmlfile)

        # Clustering vertical lines
        # cluster the detected *vertical* lines using find_clusters_1d_break_dist as simple clustering function
        # (break on distance MIN_COL_WIDTH/2)
        # additionally, remove all cluster sections that are considered empty
        # a cluster is considered empty when the number of text boxes in it is below 10% of the median number of text boxes
        # per cluster section
        MIN_COL_WIDTH = g.MIN_COL_WIDTH  # minimum width of a column in pixels, measured in the scanned pages
        vertical_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_VERTICAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p['texts'],  # use this page's textboxes
            remove_empty_cluster_sections_n_texts_ratio=0.1,  # 10% rule
            # the positions are in "scanned image space" -> we scale them to "text box space"
            remove_empty_cluster_sections_scaling=page_scaling_x,
            dist_thresh=MIN_COL_WIDTH / 2)
        print("> found %d clusters" % len(vertical_clusters))

        # draw the clusters
        img_w_clusters = iproc_obj.draw_line_clusters(
            imgproc.DIRECTION_VERTICAL, vertical_clusters)
        save_img_file = os.path.join(
            self.temp_folder, '%s-vertical-clusters.png' % imgfilebasename)
        print("> saving image with detected vertical clusters to '%s'" %
              save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters)

        # Clustering horizontal lines
        # cluster the detected *horizontal* lines using find_clusters_1d_break_dist as simple clustering function
        # (break on distance MIN_ROW_WIDTH/2)
        # additionally, remove all cluster sections that are considered empty
        # a cluster is considered empty when the number of text boxes in it is below 10% of the median number of text boxes
        # per cluster section
        MIN_ROW_WIDTH = g.MIN_ROW_WIDTH  # minimum width of a row in pixels, measured in the scanned pages
        horizontal_clusters = iproc_obj.find_clusters(
            imgproc.DIRECTION_HORIZONTAL,
            find_clusters_1d_break_dist,
            remove_empty_cluster_sections_use_texts=p['texts'],  # use this page's textboxes
            remove_empty_cluster_sections_n_texts_ratio=0.1,  # 10% rule
            # the positions are in "scanned image space" -> we scale them to "text box space"
            remove_empty_cluster_sections_scaling=page_scaling_y,
            dist_thresh=MIN_ROW_WIDTH / 2)
        print("> found %d clusters" % len(horizontal_clusters))

        # draw the clusters
        img_w_clusters_hoz = iproc_obj.draw_line_clusters(
            imgproc.DIRECTION_HORIZONTAL, horizontal_clusters)
        save_img_file = os.path.join(
            self.temp_folder, '%s-horizontal-clusters.png' % imgfilebasename)
        # this message used to say "vertical" although the horizontal
        # clusters are saved here
        print("> saving image with detected horizontal clusters to '%s'" %
              save_img_file)
        cv2.imwrite(save_img_file, img_w_clusters_hoz)

        # cluster centers, scaled back from image space to text box space
        page_colpos = np.array(
            calc_cluster_centers_1d(vertical_clusters)) / page_scaling_x
        print('found %d column borders:' % len(page_colpos))
        print(page_colpos)

        page_rowpos = np.array(
            calc_cluster_centers_1d(horizontal_clusters)) / page_scaling_y
        print('found %d row borders:' % len(page_rowpos))
        print(page_rowpos)

        # right border of the second column
        # (requires at least three detected column borders)
        col2_rightborder = page_colpos[2]

        # calculate median text box height
        median_text_height = np.median([t['height'] for t in p['texts']])

        # get all texts in the first two columns with a "usual" textbox height
        # we will only use these text boxes in order to determine the line positions because they are more "stable"
        # otherwise, especially the right side of the column header can lead to problems detecting the first table row
        text_height_deviation_thresh = median_text_height / 2
        texts_cols_1_2 = [
            t for t in p['texts'] if t['right'] <= col2_rightborder
            and abs(t['height'] -
                    median_text_height) <= text_height_deviation_thresh
        ]

        # get all textboxes' top and bottom border positions
        borders_y = border_positions_from_texts(texts_cols_1_2,
                                                DIRECTION_VERTICAL)

        # break into clusters using half of the median text height as break distance
        clusters_y = find_clusters_1d_break_dist(
            borders_y, dist_thresh=median_text_height / 2)
        clusters_w_vals = zip_clusters_and_values(clusters_y, borders_y)

        # for each cluster, calculate the median as center
        pos_y = calc_cluster_centers_1d(clusters_w_vals)
        pos_y.append(p['height'])

        print('number of line positions:', len(pos_y))

        pttrn_table_row_beginning = re.compile(
            r'^[\d Oo][\d Oo]{2,} +[A-ZÄÖÜ]')

        # 1. try to find the top row of the table
        texts_cols_1_2_per_line = split_texts_by_positions(
            texts_cols_1_2,
            pos_y,
            DIRECTION_VERTICAL,
            alignment='middle',
            enrich_with_positions=True)

        # go through the texts line per line
        for line_texts, (line_top, line_bottom) in texts_cols_1_2_per_line:
            line_str = join_texts(line_texts)
            # check if the line content matches the given pattern
            if pttrn_table_row_beginning.match(line_str):
                top_y = line_top
                break
        else:
            top_y = 0

        print('Top_y: %s' % top_y)

        # hints for a footer text box
        words_in_footer = ('anzeige', 'annahme', 'ala')

        # 2. try to find the bottom row of the table
        min_footer_text_height = median_text_height * 1.5
        min_footer_y_pos = p['height'] * 0.7
        # get all texts in the lower 30% of the page that are at least 50% bigger than the median textbox height
        bottom_texts = [
            t for t in p['texts'] if t['top'] >= min_footer_y_pos
            and t['height'] >= min_footer_text_height
        ]
        bottom_texts_per_line = split_texts_by_positions(
            bottom_texts,
            pos_y + [p['height']],  # always down to the end of the page
            DIRECTION_VERTICAL,
            alignment='middle',
            enrich_with_positions=True)
        # go through the texts at the bottom line per line
        page_span = page_colpos[-1] - page_colpos[0]
        min_footer_text_width = page_span * 0.8
        for line_texts, (line_top, line_bottom) in bottom_texts_per_line:
            line_str = join_texts(line_texts)
            has_wide_footer_text = any(t['width'] >= min_footer_text_width
                                       for t in line_texts)
            # check if there's at least one wide text or if all of the required words for a footer match
            if has_wide_footer_text or all_a_in_b(words_in_footer, line_str):
                bottom_y = line_top
                break
        else:
            bottom_y = p['height']

        print(bottom_y)
        print(pos_y)

        # top_y / bottom_y are only reported here: the grid below is built
        # from the row borders of the horizontal line clusters, the filtering
        # of pos_y by top_y and bottom_y is not applied
        print(page_rowpos)
        print("> page %d: %d lines between [%f, %f]" %
              (p_num, len(page_rowpos), top_y, bottom_y))

        print(page_colpos, page_rowpos)
        grid = make_grid_from_positions(page_colpos, page_rowpos)
        n_rows = len(grid)
        # an empty grid has no first row to take the column count from
        n_cols = len(grid[0]) if n_rows > 0 else 0
        print("> page %d: grid with %d rows, %d columns" %
              (p_num, n_rows, n_cols))

        page_grids_file = os.path.join(self.temp_folder,
                                       filename + '_pagegrids.json')
        print("saving page grids JSON file to '%s'" % page_grids_file)
        save_page_grids({p_num: grid}, page_grids_file)

        # put the text boxes into the grid cells and convert to a data frame
        datatable = fit_texts_into_grid(p['texts'], grid)
        df = datatable_to_dataframe(datatable)

        csv_output_file = os.path.join(self.tables_folder, filename + '.csv')
        print("saving extracted data to '%s'" % csv_output_file)
        df.to_csv(csv_output_file, index=False, header=False)
    line_heights = np.diff(pos_y)
    
    print("> page %d: %d lines between [%f, %f], median text height = %f, median line height = %f, min line height = %f, max line height = %f"
          % (p_num, len(pos_y), top_y, bottom_y,
             median_text_height, np.median(line_heights),
             min(line_heights), max(line_heights)))

#%% Create page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's 
# loadGridFile() function

# Build one grid per page from its column and line positions and save all
# grids to a single JSON file.
print("creating page grids for all pages...")
page_grids = {}
for p_num, p in pages.items():
    grid = make_grid_from_positions(col_positions[p_num], line_positions[p_num])
    n_rows = len(grid)
    # an empty grid has no first row to take the column count from
    n_cols = len(grid[0]) if n_rows > 0 else 0
    print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))
    page_grids[p_num] = grid

page_grids_file = os.path.join(OUTPUTPATH, output_files_basename + '.pagegrids.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
save_page_grids(page_grids, page_grids_file)

#%% Create data frames (requires pandas library)

# For sake of simplicity, we will just fit the text boxes into the grid, merge the texts in their cells and
# output the result.
# Normally, at this step you will need to do some error correction / parsing as some text boxes are not correctly
# detected during OCR (they might span over multiple columns or numbers are incorrectly detected as letters).
        diffsum = None
    
    if diffsum is None or diffsum > CORRECT_COLS_MIN_DIFFSUM:   # correct the columns for this page
        print('> page %d: corrected (diffsum was %f)' % (p_num, diffsum))
        x_offset = page_col_positions[p_num][0]
        corrected_pos = list(col_medians + x_offset)
        page_col_positions[p_num] = corrected_pos


#%% Create the page grids from the row and column positions and save them
# maps page number -> grid built from that page's column and row positions
page_grids = {}

for p_num, col_positions in page_col_positions.items():
    # create the grid
    row_positions = page_row_positions[p_num]
    grid = make_grid_from_positions(col_positions, row_positions)

    n_rows = len(grid)
    # an empty grid has no first row to take the column count from
    n_cols = len(grid[0]) if n_rows > 0 else 0
    print("> page %d: grid with %d rows, %d columns" % (p_num, n_rows, n_cols))

    page_grids[p_num] = grid

# save the page grids

# After you created the page grids, you should then check that they're correct using pdf2xml-viewer's
# loadGridFile() function

page_grids_file = os.path.join(OUTPUTPATH, output_files_basename + '.pagegrids.json')
print("saving page grids JSON file to '%s'" % page_grids_file)
save_page_grids(page_grids, page_grids_file)