def split_pdf_to_png_files (pdf_file_spec,output_dir):
    """
    Rasterize the PDF at PDF_FILE_SPEC into PNG images (one per page)
    by delegating to pdf.pdf_to_pngs, which is asked to write into the
    directory OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; if it is not, the problem
    is logged and the process exits via sys.exit. Any Exception raised
    while converting is logged, echoed to stdout, and likewise ends
    the process via sys.exit.

    On success, returns the value produced by pdf.pdf_to_pngs (the
    call site describes it as an array of (<file_name>,<page_number>)
    tuples).
    """
    try:
        # sanity check -- sys.exit raises SystemExit, which is not an
        # Exception subclass, so the handler below does not intercept it
        if not os.path.isabs(pdf_file_spec):
            abs_path_msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108,[abs_path_msg],False,files=[pdf_file_spec]))
            sys.exit(abs_path_msg)
        # array of (<file_name>,<page_number>) tuples
        png_specs = pdf.pdf_to_pngs(pdf_file_spec,output_dir)
    except Exception as e:
        failure_msg = json1.json_failed_to_convert_pdf(e,pdf_file_spec)
        lg.error(failure_msg)
        # parenthesized single-argument form prints the same text as
        # the Python 2 print statement
        print("failed to convert PDF file(s): %s" % pdf_file_spec)
        print("e: %s" % e)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure_msg)
    # only reached on success: both failure paths above exit the process
    lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec,png_specs))
    return png_specs
def invoke_pdfimages_on (pdf_file_spec,output_dir):
    """
    Extract the images embedded in the PDF at PDF_FILE_SPEC as PNG
    files by delegating to pdf.pdfimages, which is asked to write into
    the directory OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; if it is not, the problem
    is logged and the process exits via sys.exit. Any Exception raised
    during extraction is logged and likewise ends the process via
    sys.exit.

    On success, returns the value produced by pdf.pdfimages. That
    value is expected to be a list of (png_file,png_file_page_number)
    tuples, with png_file_page_number an integer and the list ordered
    by page number from low to high.
    """
    try:
        # sanity check -- sys.exit raises SystemExit, which is not an
        # Exception subclass, so the handler below does not intercept it
        if not os.path.isabs(pdf_file_spec):
            abs_path_msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108,[abs_path_msg],False,files=[pdf_file_spec]))
            sys.exit(abs_path_msg)
        page_image_tuples = pdf.pdfimages(pdf_file_spec,output_dir)
    except Exception as e:
        lg.debug(str(e))
        failure_msg = json1.json_failed_to_convert_pdf(e,pdf_file_spec)
        lg.error(failure_msg)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure_msg)
    # only reached on success: both failure paths above exit the process
    # Is it really important to log png files? (Need to dig them out of tuples...)
    lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec,None))
    return page_image_tuples
def pdf_to_pngs(pdf_file, output_dir):
    """
    Generate PNG files, one corresponding to each page of the PDF file
    PDF_FILE. Write files to directory specified by OUTPUT_DIR.

    The page count is read with PyPDF2 and then handed, together with
    the input file's base name (no directory, no extension), to
    pdf_to_pngs__pdftoppm. Returns whatever pdf_to_pngs__pdftoppm
    returns.

    If the PDF cannot be opened or its page count cannot be read, an
    error is logged and the original exception is re-raised.
    """
    # base name of the input file, stripped of directory and extension
    outfile_root = os.path.splitext(os.path.basename(pdf_file))[0]
    # determine number of pages
    try:
        # the context manager guarantees the file handle is closed
        # even when PyPDF2 fails part-way through parsing
        with open(pdf_file, "rb") as pdf_stream:
            # reading the PDF can fail if the PDF, or an object
            # therein, is corrupt
            number_of_pages = PyPDF2.PdfFileReader(pdf_stream).getNumPages()
    except Exception:
        # NOTE(review): other json_msg call sites pass files=[...];
        # confirm that the file= keyword used here is accepted.
        lg.error(
            json1.json_msg(
                109,
                "Failure to open or parse a PDF file -- possible indication of a corrupt PDF",
                None,
                file=pdf_file))
        # bare raise preserves the original traceback
        raise
    lg.info(json1.json_pdf_info(number_of_pages))
    # Qs:
    # 1. advantages/disadvantages of gs and pdftoppm = ?
    # 2. is there really no way to just scan directly from PDF, specifying page number as we go?
    return pdf_to_pngs__pdftoppm(pdf_file, number_of_pages, outfile_root, output_dir)
def barcodeScan(imagePNGPath, scan_region):
    """
    Scan the image file at IMAGEPNGPATH for a barcode and return the
    result of barcode_scan_at_resolutions: a string corresponding to
    the barcode-encoded data when a barcode was found, otherwise a
    false value (a warning is logged in that case).

    Search within the region defined by SCAN_REGION when SCAN_REGION
    is a list. When SCAN_REGION is a list, it specifies two points as
    [x1,y1,x2,y2]. These two points (x1,y1) and (x2,y2) are pairs
    (x,y) of percentages (each expressed as a value between 0 and 1.0)
    relative to the dimensions of the image; they define the box
    within which the barcode scan occurs. x values are scaled by the
    image width and y values by the image height.

    If SCAN_REGION is not a list (or is an empty list), the full image
    is analyzed. If analysis of the full image is desirable, do not
    set SCAN_REGION to [0,0,1,1] but instead set it to None or some
    other non-list value.

    A SCAN_REGION member outside the range [0,1] is logged and the
    process exits via sys.exit.
    """
    # sanity check(s)
    if not isinstance(scan_region,list):
        scan_region = None
    else:
        for value in scan_region:
            if (value < 0 or value > 1):
                msg = json1.json_msg(999,"insane scan region value",False,None)
                lg.error(msg)
                lg.info(json1.json_last_log_msg())
                sys.exit(msg)
    # obtain image data via PIL (an alternative route through
    # cv2/numpy grayscale conversion was considered but is not used)
    # PIL origin (0,0) is top left corner
    # 'L' is "black and white mode": converts to 8-bit pixels B/W
    pil = Image.open(imagePNGPath).convert('L')
    pilCropped = pil
    width, height = pil.size
    lg.debug("width: %s height: %s",width,height)
    if scan_region:
        # relative (percentage) values between 0 and 1; min/max make
        # the result independent of the order the two points are given
        x_crop_min = min(scan_region[0],scan_region[2])
        x_crop_max = max(scan_region[0],scan_region[2])
        y_crop_min = min(scan_region[1],scan_region[3])
        y_crop_max = max(scan_region[1],scan_region[3])
        # horizontal bounds scale with the image WIDTH, vertical
        # bounds with the image HEIGHT
        cropLeft = int(width*x_crop_min)
        cropRight = int(width*x_crop_max)
        cropTop = int(height*y_crop_min)
        cropBottom = int(height*y_crop_max)
        # crop box is 4-tuple: left,upper,right,lower
        pilCropBox = (cropLeft,cropTop,cropRight,cropBottom)
        pilCropped = pil.crop(pilCropBox)
    # zbar sometimes catches a barcode at a lower resolution but
    # misses it at a higher resolution. Scan for barcode with several
    # variants of the (possibly cropped) image.
    barcodeString = barcode_scan_at_resolutions(pilCropped,None)
    if not barcodeString:
        lg.warn(json1.json_barcode_not_found_msg([imagePNGPath],""))
    return barcodeString
def pdf_number_of_pages(pdf_file):
    """
    Determine the number of pages in the PDF document PDF_FILE using
    PyPDF2. Return the value reported by getNumPages.

    If the PDF cannot be opened or its page count cannot be read, an
    error is logged and the original exception is re-raised.
    """
    try:
        # the context manager guarantees the file handle is closed
        # even when PyPDF2 fails part-way through parsing
        with open(pdf_file, "rb") as pdf_stream:
            # reading the PDF can fail if the PDF, or an object
            # therein, is corrupt
            return PyPDF2.PdfFileReader(pdf_stream).getNumPages()
    except Exception:
        # NOTE(review): other json_msg call sites pass files=[...];
        # confirm that the file= keyword used here is accepted.
        lg.error(
            json1.json_msg(
                109,
                "Failure to open or parse a PDF file -- possible indication of a corrupt PDF",
                None,
                file=pdf_file))
        # bare raise preserves the original traceback
        raise
def pdfxcb (pdf_file_spec,output_dir,match_re,rasterize_p,clean_up_png_files=False):
    """
    Given the file specified by PDF_FILE_SPEC, look for cover sheets
    and split the PDF at each coversheet. Name output file(s) based on
    cover sheet content. Write files to directory specified by
    OUTPUT_DIR. Return True.

    If MATCH_RE is defined, ignore barcodes unless the corresponding
    string matches the regex MATCH_RE.

    Use RASTERIZE_P = False if the PDF does not contain vector
    graphics but is solely bitmap data (e.g., the PDF was generated
    from a scanned document).

    CLEAN_UP_PNG_FILES controls whether the intermediate PNG files are
    removed from OUTPUT_DIR once cover sheets have been located. The
    default (False) keeps them, which supports
    debugging/development; pass True in production.
    """
    global lg
    sanity_checks([output_dir],[pdf_file_spec])
    # If confident that the PDF under analysis is derived from a scan
    # (i.e., contains only bitmap data), then the images embedded in
    # the PDF can be analyzed directly. If the PDF may contain vector
    # data on the cover sheet pages, then rasterization is indicated.
    # See doc/optimization.md for notes on time implications.

    # PNG_FILE_PAGE_NUMBER_TUPLES is an array where each member has
    # the form (<PNG file name>, <PDF page number>). There is no
    # guarantee that all pages in the original PDF document are
    # represented. Furthermore, there may be multiple PNG images per
    # PDF page -- i.e., the array might include ("flurpies.png",1) and
    # ("glurpies.png",1).

    # FIXME: consider having a single call here -- FOO -- that specializes on rasterize_p
    if rasterize_p:
        # extract PDF pages as image data (PNG files)
        png_file_page_number_tuples = split_pdf_to_png_files(pdf_file_spec,output_dir)
        # Once rasterized pages are generated, optionally scan for cue marks
        # CUE_INDICES = array where each member is an integer indicating index of member of png_file_page_number_tuples where the corresponding bitmap has a cue mark
        # cue_indices = scan_for_cue_marks(png_file_page_number_tuples) <-- use urh_corner_mean w/reasonable threshold (10? 20? 50?) for "black"

        # png files represent rasterized pages: restrict the barcode
        # scan to a sub-region of each page
        scan_region = [0,0,0.7,0.5]
    else:
        # extract images directly from PDF
        png_file_page_number_tuples = invoke_pdfimages_on(pdf_file_spec,output_dir)
        # png files represent images from PDF (via pdfimages): scan
        # each image in full.
        # None is not treated as the equivalent of ([0,0,1,1]). ([0,0,1,1]) triggers cropping by barcodeScan.
        scan_region = None
    # Code below expects png_file_page_number_tuples to be ordered with respect to page number.
    # Note that sorted default is ascending order.
    png_file_page_number_tuples = sorted(png_file_page_number_tuples,
                                         key=lambda png_and_page: png_and_page[1])
    #
    # locate cover sheets
    #
    cover_sheet_barcodes, cover_sheet_indices = locate_cover_sheets(png_file_page_number_tuples,output_dir,match_re,scan_region)
    # NOTE(review): this print duplicates the lg.debug call below;
    # kept because callers may rely on the stdout output -- confirm
    # and remove if not.
    print(cover_sheet_barcodes)
    lg.debug(cover_sheet_barcodes)
    lg.debug(cover_sheet_indices)
    if clean_up_png_files:
        for png_file_tuple in png_file_page_number_tuples:
            os.remove(os.path.join(output_dir,png_file_tuple[0]))
    # write PDFs
    # len(png_files) only works if PNGs are rasterized pages, so ask
    # the PDF itself for its length
    pdf_length = pdf.pdf_number_of_pages(pdf_file_spec)
    page_ranges = generate_page_ranges(cover_sheet_indices,png_file_page_number_tuples,pdf_length)
    output_file_names = generate_output_file_names(cover_sheet_barcodes,cover_sheet_indices,output_dir)
    lg.debug(output_file_names)
    pdf.pdf_split(pdf_file_spec,output_file_names,page_ranges)
    lg.info(json1.json_msg(40,
                           ['Analysis and burst completed'],
                           False,
                           files=output_file_names,
                           data={
                               'barcodes': cover_sheet_barcodes,
                               'indices': cover_sheet_indices
                           }
    ))
    return True