Beispiel #1
0
        "> found pages separator line at %f (image space position) / %f (page space position)"
        % (sep_line_img_x, sep_line_page_x))

    # split the scanned double page at the separator line
    split_images = iproc_obj.split_image(sep_line_img_x)

    # split the textboxes at the separator line
    split_texts = split_page_texts(p, sep_line_page_x)

    split_texts_and_images.append((p, split_texts, split_images))

# Build the output XML path "<input stem>.split.xml" inside OUTPUTPATH.
# os.path.splitext() replaces slicing at rindex('.'): an input name without any
# dot no longer raises ValueError, and a dot inside a directory component is
# never mistaken for the extension separator.
split_pages_xmlfile = os.path.join(
    OUTPUTPATH, os.path.splitext(INPUT_XML)[0] + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)

# generate a new XML and "pages" dict structure from the split pages; the target
# file path is handed over via 'save_to_output_path'
split_tree, split_root, split_pages = create_split_pages_dict_structure(
    split_texts_and_images, save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages

# scaling of the scanned page image in relation to the OCR page dimensions for each page
pages_image_scaling = {}

# clusters of horizontal lines detected in this cell
# (presumably stored per page -- confirm against the loop that fills it)
hori_lines_clusters = {}

for p_num, p in split_pages.items():
    # get the image file of the scanned page
    imgfilebasename = p['image'][:p['image'].rindex('.')]
    imgfile = os.path.join(OUTPUTPATH, p['image'])
    sep_line_page_x = sep_line_img_x / page_scaling_x
    print("> found pages separator line at %f (image space position) / %f (page space position)"
          % (sep_line_img_x, sep_line_page_x))
    
    # split the scanned double page at the separator line
    split_images = iproc_obj.split_image(sep_line_img_x)
    
    # split the textboxes at the separator line
    split_texts = split_page_texts(p, sep_line_page_x)
    
    split_texts_and_images.append((p, split_texts, split_images))
    
# Build the output XML path "<input stem>.split.xml" inside OUTPUTPATH.
# os.path.splitext() replaces slicing at rindex('.'): an input name without any
# dot no longer raises ValueError, and a dot inside a directory component is
# never mistaken for the extension separator.
input_xml_stem = os.path.splitext(INPUT_XML)[0]
split_pages_xmlfile = os.path.join(OUTPUTPATH, input_xml_stem + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)

# generate a new XML and "pages" dict structure from the split pages; the target
# file path is handed over via 'save_to_output_path'
split_tree, split_root, split_pages = create_split_pages_dict_structure(split_texts_and_images,
                                                                        save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages

# scaling of the scanned page image in relation to the OCR page dimensions for each page
pages_image_scaling = {}

# clusters of horizontal lines detected in this cell
# (presumably stored per page -- confirm against the loop that fills it)
hori_lines_clusters = {}

for p_num, p in split_pages.items():
    # get the image file of the scanned page
    imgfilebasename = p['image'][:p['image'].rindex('.')]
    imgfile = os.path.join(OUTPUTPATH, p['image'])
    
    print("page %d: detecting lines in image file '%s'..." % (p_num, imgfile))