# Split one scanned double page at its vertical separator line, then write the
# split pages out as a new XML / "pages" dict structure.
#
# NOTE(review): the statements down to `split_texts_and_images.append(...)` use
# per-page names (`p`, `iproc_obj`, `imgfilebasename`, `page_scaling_x`) and
# presumably belong to the body of a per-page loop whose header lies outside
# this chunk -- re-indent them to that loop's level when merging.

# save a copy of the page image with the detected lines drawn in
save_image_w_lines(iproc_obj, imgfilebasename, True, 'bothpages-')

# find the vertical line that separates both sides
sep_line_img_x = iproc_obj.find_pages_separator_line(
    dist_thresh=MIN_COL_WIDTH / 2)
# the separator is found in image space; the text boxes need it in page space
sep_line_page_x = sep_line_img_x / page_scaling_x
print(
    "> found pages separator line at %f (image space position) / %f (page space position)"
    % (sep_line_img_x, sep_line_page_x))

# split the scanned double page at the separator line
split_images = iproc_obj.split_image(sep_line_img_x)

# split the textboxes at the separator line
split_texts = split_page_texts(p, sep_line_page_x)

# remember the original page together with its split texts and split images
split_texts_and_images.append((p, split_texts, split_images))

# generate a new XML and "pages" dict structure from the split pages
# os.path.splitext strips only the final extension and, unlike slicing at
# INPUT_XML.rindex('.'), does not raise ValueError for a name without a dot
split_pages_xmlfile = os.path.join(
    OUTPUTPATH, os.path.splitext(INPUT_XML)[0] + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)
split_tree, split_root, split_pages = create_split_pages_dict_structure(
    split_texts_and_images, save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages
hough_votes_thresh=350) print("> found %d lines" % len(lines_hough)) save_image_w_lines(iproc_obj, imgfilebasename, True, 'bothpages-') # find the vertical line that separates both sides sep_line_img_x = iproc_obj.find_pages_separator_line(dist_thresh=MIN_COL_WIDTH/2) sep_line_page_x = sep_line_img_x / page_scaling_x print("> found pages separator line at %f (image space position) / %f (page space position)" % (sep_line_img_x, sep_line_page_x)) # split the scanned double page at the separator line split_images = iproc_obj.split_image(sep_line_img_x) # split the textboxes at the separator line split_texts = split_page_texts(p, sep_line_page_x) split_texts_and_images.append((p, split_texts, split_images)) # generate a new XML and "pages" dict structure from the split pages split_pages_xmlfile = os.path.join(OUTPUTPATH, INPUT_XML[:INPUT_XML.rindex('.')] + '.split.xml') print("> saving split pages XML to '%s'" % split_pages_xmlfile) split_tree, split_root, split_pages = create_split_pages_dict_structure(split_texts_and_images, save_to_output_path=split_pages_xmlfile) # we don't need the original double pages any more, we'll work with 'split_pages' del pages #%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages hori_lines_clusters = {}