コード例 #1
0
ファイル: schoollist_1.py プロジェクト: sohailsk10/PDF_TO_OCR
    save_image_w_lines(iproc_obj, imgfilebasename, True, 'bothpages-')

    # find the vertical line that separates both sides
    sep_line_img_x = iproc_obj.find_pages_separator_line(
        dist_thresh=MIN_COL_WIDTH / 2)
    sep_line_page_x = sep_line_img_x / page_scaling_x
    print(
        "> found pages separator line at %f (image space position) / %f (page space position)"
        % (sep_line_img_x, sep_line_page_x))

    # split the scanned double page at the separator line
    split_images = iproc_obj.split_image(sep_line_img_x)

    # split the textboxes at the separator line
    split_texts = split_page_texts(p, sep_line_page_x)

    split_texts_and_images.append((p, split_texts, split_images))

# Generate a new XML file and "pages" dict structure from the split pages.
# The output file name is INPUT_XML with its last extension replaced by
# '.split.xml'. os.path.splitext() is used instead of slicing at rindex('.')
# so that an INPUT_XML without any dot does not raise ValueError, and a dot
# inside a directory component is never mistaken for the extension separator.
split_pages_xmlfile = os.path.join(
    OUTPUTPATH, os.path.splitext(INPUT_XML)[0] + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)
split_tree, split_root, split_pages = create_split_pages_dict_structure(
    split_texts_and_images, save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages
コード例 #2
0
                                         hough_votes_thresh=350)
    print("> found %d lines" % len(lines_hough))
    
    save_image_w_lines(iproc_obj, imgfilebasename, True, 'bothpages-')
    
    # find the vertical line that separates both sides
    sep_line_img_x = iproc_obj.find_pages_separator_line(dist_thresh=MIN_COL_WIDTH/2)
    sep_line_page_x = sep_line_img_x / page_scaling_x
    print("> found pages separator line at %f (image space position) / %f (page space position)"
          % (sep_line_img_x, sep_line_page_x))
    
    # split the scanned double page at the separator line
    split_images = iproc_obj.split_image(sep_line_img_x)
    
    # split the textboxes at the separator line
    split_texts = split_page_texts(p, sep_line_page_x)
    
    split_texts_and_images.append((p, split_texts, split_images))
    
# Generate a new XML file and "pages" dict structure from the split pages.
# The output file name is INPUT_XML with its last extension replaced by
# '.split.xml'. os.path.splitext() is used instead of slicing at rindex('.')
# so that an INPUT_XML without any dot does not raise ValueError, and a dot
# inside a directory component is never mistaken for the extension separator.
split_pages_xmlfile = os.path.join(OUTPUTPATH, os.path.splitext(INPUT_XML)[0] + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)
split_tree, split_root, split_pages = create_split_pages_dict_structure(split_texts_and_images,
                                                                        save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages

hori_lines_clusters = {}