def split_pdf_to_png_files(pdf_file_spec, output_dir):
    """
    Split the PDF file specified by PDF_FILE_SPEC into a series of
    files, each representing a single page as a PNG image. Write
    files to the directory specified by OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; otherwise an error is
    logged and the process exits via sys.exit.

    Returns the value produced by pdf.pdf_to_pngs (noted in this file
    as an array of (file_name, page_number) tuples). If the conversion
    raises, the failure is logged and printed and the process exits
    via sys.exit.
    """
    try:
        # sanity check
        if os.path.isabs(pdf_file_spec):
            # array of (file_name, page_number) tuples
            png_specs = pdf.pdf_to_pngs(pdf_file_spec, output_dir)
        else:
            msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108, [msg], False, files=[pdf_file_spec]))
            # SystemExit is not an Exception subclass, so the handler
            # below does not intercept this exit.
            sys.exit(msg)
    except Exception as err:
        failure = json1.json_failed_to_convert_pdf(err, pdf_file_spec)
        lg.error(failure)
        print("failed to convert PDF file(s): %s" % pdf_file_spec)
        print("e: %s" % err)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure)
    # Only reached when the conversion succeeded: every failure path
    # above leaves through sys.exit.
    lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec, png_specs))
    return png_specs
def invoke_pdfimages_on(pdf_file_spec, output_dir):
    """
    Extract images in the PDF file specified by PDF_FILE_SPEC into a
    series of files, each representing a single PNG image. Write files
    to the directory specified by OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; otherwise an error is
    logged and the process exits via sys.exit.

    Returns a list of tuples where each tuple has the structure

        (png_file, png_file_page_number)

    png_file_page_number is an integer. The list is an ordered
    sequence with respect to page number - low to high.

    If the extraction raises, the failure is logged and the process
    exits via sys.exit.
    """
    try:
        # sanity check
        if os.path.isabs(pdf_file_spec):
            page_tuples = pdf.pdfimages(pdf_file_spec, output_dir)
        else:
            msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108, [msg], False, files=[pdf_file_spec]))
            # SystemExit is not an Exception subclass, so the handler
            # below does not intercept this exit.
            sys.exit(msg)
    except Exception as err:
        lg.debug(str(err))
        failure = json1.json_failed_to_convert_pdf(err, pdf_file_spec)
        lg.error(failure)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure)
    # Only reached on success. Is it really important to log png files?
    # (Need to dig them out of tuples...) For now None is passed in
    # place of the file list.
    lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec, None))
    return page_tuples
def barcodeScan(imagePNGPath, scan_region):
    """
    Return None if a barcode was not found. If a barcode was found,
    return a string corresponding to the barcode-encoded data.

    Search within the region defined by SCAN_REGION when SCAN_REGION
    is a list. When SCAN_REGION is a list, it specifies two points as
    [x1,y1,x2,y2]. These two points (x1,y1) and (x2,y2) are pairs
    (x,y) of percentages (each expressed as a value between 0 and 1.0)
    relative to the dimensions of the image; they define the box
    within which the barcode scan occurs. The two points may be given
    in either order.

    If SCAN_REGION is not a list, the full image is analyzed. If
    analysis of the full image is desirable, do not set SCAN_REGION
    to [0,0,1,1] but instead set it to None or some other non-list
    value.

    Exits via sys.exit if any value in SCAN_REGION lies outside
    the range [0, 1].
    """
    # sanity check(s)
    if not isinstance(scan_region, list):
        scan_region = None
    else:
        for value in scan_region:
            if value < 0 or value > 1:
                # NOTE(review): other callers in this file pass a list of
                # messages as the second argument to json_msg - confirm a
                # bare string is acceptable here.
                msg = json1.json_msg(999, "insane scan region value", False, None)
                lg.error(msg)
                lg.info(json1.json_last_log_msg())
                sys.exit(msg)

    # Obtain image data via PIL. PIL origin (0,0) is the top left corner.
    # 'L' is "black and white mode": converts to 8-bit pixels B/W.
    # (A cv2/numpy based grayscale conversion was tried previously and
    # is not used.)
    pil = Image.open(imagePNGPath).convert('L')
    pilCropped = pil
    width, height = pil.size
    lg.debug("width: %s height: %s", width, height)

    if scan_region:
        # relative (percentage) values between 0 and 1
        x_crop_min = min(scan_region[0], scan_region[2])
        x_crop_max = max(scan_region[0], scan_region[2])
        y_crop_min = min(scan_region[1], scan_region[3])
        y_crop_max = max(scan_region[1], scan_region[3])
        # Horizontal bounds scale with the image width and vertical
        # bounds with the image height.
        cropLeft = int(width * x_crop_min)
        cropRight = int(width * x_crop_max)
        cropTop = int(height * y_crop_min)
        cropBottom = int(height * y_crop_max)
        # crop box is 4-tuple: left,upper,right,lower
        pilCropBox = (cropLeft, cropTop, cropRight, cropBottom)
        pilCropped = pil.crop(pilCropBox)

    # zbar sometimes catches a barcode at a lower resolution but misses
    # it at a higher resolution. Scan for the barcode with several
    # variants of the (possibly cropped) image.
    barcodeString = barcode_scan_at_resolutions(pilCropped, None)
    if not barcodeString:
        lg.warning(json1.json_barcode_not_found_msg([imagePNGPath], ""))
    return barcodeString
def module_sanity_check(module_name, exitp):
    """
    Check that the module named by the string MODULE_NAME can be
    located; it might not be installed/accessible.

    When the lookup raises ImportError the problem is logged and, if
    EXITP is true, the process exits via sys.exit. Otherwise the
    function simply returns.
    """
    try:
        imp.find_module(module_name)
    except ImportError:
        accessible = False
    else:
        accessible = True
    if accessible:
        return
    failure = json1.json_msg_module_not_accessible(module_name)
    lg.error(failure)
    lg.info(json1.json_last_log_msg())
    if exitp:
        sys.exit(failure)
def executable_sanity_checks(executables):
    """
    Check for availability of the executables specified in the list
    of strings EXECUTABLES.

    The first executable that util.which cannot locate is logged and
    the process exits via sys.exit.
    """
    lg.debug(util)
    for executable_spec in executables:
        if util.which(executable_spec):
            continue
        failure = json1.json_msg_executable_not_accessible(executable_spec)
        lg.error(failure)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure)
def signal_handler(signal, frame):
    """
    Log that termination was requested externally, then exit.

    SIGNAL and FRAME are accepted but not used by the handler.
    """
    # Ensure receipt of signal is logged prior to terminating
    lg.error(json1.json_exit_on_external_request_msg())
    lg.info(json1.json_last_log_msg())
    sys.exit()
def main():
    """
    Handle command-line invocation of pdfxcb.py.

    Parses the command line, configures the root logger to write
    '%(message)s'-formatted records to a (truncated) log file, logs
    the opening message, runs pdfxcb on the single input PDF and logs
    the closing message.

    Raises Exception if the directory of the requested log file does
    not exist. Any exception raised by pdfxcb is logged and re-raised.
    """
    parser = argparse.ArgumentParser(description="This is pdfxcb")
    parser.add_argument("-f",
                        help="absolute path to log file",
                        action="store",
                        dest="log_file",
                        type=str)
    parser.add_argument("-d",
                        help="absolute path to output directory",
                        action="store",
                        dest="output_dir",
                        type=str)
    parser.add_argument("-m",
                        help="match barcodes to regex (ignore if no match)",
                        action="store",
                        dest="match_re_string",
                        type=str)
    parser.add_argument("-p",
                        help="identifier for a specific instance of pdfxcb",
                        action="store",
                        dest="identifier",
                        type=str)
    parser.add_argument("-l",
                        help="integer between 0 (verbose) and 51 (terse) defining logging",
                        action="store",
                        dest="log_level",
                        type=int)
    #parser.add_argument('-v', '--version', action='version', version=version.version)
    parser.add_argument("input_files",
                        help="an input (PDF) file",
                        # keep nargs as we may want to accept multiple
                        # PDFs as input at some point
                        nargs=1,
                        type=str)
    args = parser.parse_args()

    #
    # define logging (level, file, message format, ...)
    #
    log_level = args.log_level
    if not (isinstance(log_level, int) and 0 <= log_level <= 51):
        # default to INFO since this function doesn't necessarily exit
        # quickly
        log_level = logging.INFO
    logfile = args.log_file if args.log_file else 'busca.log'
    json_log_format = '%(message)s'

    root_logger = lg.getLogger()
    # Iterate over a copy: removeHandler mutates the handlers list, and
    # removing while iterating the live list skips every other handler.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
    formatter = logging.Formatter(json_log_format)

    # sanity check for existence of log file directory
    log_dir = os.path.dirname(logfile)
    if log_dir and not os.path.exists(log_dir):
        raise Exception(str.format("log file directory {0} not present",
                                   log_dir))
    file_handler = logging.FileHandler(logfile, 'w')
    file_handler.setFormatter(formatter)
    root_logger.addHandler(file_handler)
    root_logger.setLevel(log_level)
    lg.debug("args: %s", args)
    lg.debug("sys.argv: %s", sys.argv)

    if args.identifier:
        identifier = args.identifier
    else:
        identifier = str(uuid.uuid1())

    # 1000[0-9][0-9][0-9]$ matches on tt user id
    match_re_string = args.match_re_string
    lg.debug(match_re_string)
    match_re = None
    if match_re_string:
        match_re = re.compile(match_re_string)

    pdf_file_spec = args.input_files[0]
    lg.debug(pdf_file_spec)
    lg.info(json1.json_first_log_msg(identifier, files=[pdf_file_spec]))
    rasterize_p = False

    # generic debugging
    lg.debug(os.getcwd())   # current/working directory
    # might also want to import platform to get architecture, other details...

    try:
        pdfxcb(pdf_file_spec, args.output_dir, match_re, rasterize_p)
    except Exception:
        # Record the failure in the log file, then let the exception
        # propagate unchanged.
        lg.error("Crash and burn")
        lg.error(sys.exc_info()[0])
        raise
    lg.info(json1.json_last_log_msg())
def file_sanity_check(file, exitp):
    """
    Verify that FILE names an existing regular file.

    When it does not, the problem is logged and, if EXITP is true,
    the process exits via sys.exit. Otherwise the function simply
    returns.
    """
    if os.path.isfile(file):
        return
    lg.error(json1.json_file_not_found(file))
    lg.info(json1.json_last_log_msg())
    if exitp:
        sys.exit("File " + file + " not found.")
def directory_sanity_check(directory_spec, exitp):
    """
    Verify that DIRECTORY_SPEC names an existing directory.

    When it does not, the problem is logged and, if EXITP is true,
    the process exits via sys.exit. Otherwise the function simply
    returns.
    """
    if os.path.isdir(directory_spec):
        return
    lg.error(json1.json_file_not_found(directory_spec))
    lg.info(json1.json_last_log_msg())
    if exitp:
        sys.exit("Directory " + directory_spec + " not found.")
# bubbles.py, deskew.py, feature_detect.py, and util.py use numpy
# (the corresponding availability check is currently disabled)
# try:
#     imp.find_module('numpy')
# except ImportError:
#     msg = json1.json_msg_module_not_accessible('numpy')
#     lg.error(msg)
#     lg.info(json1.json_last_log_msg())
#     sys.exit(msg)

# Fail early, with a logged message, if PyPDF2 cannot be located.
try:
    imp.find_module('PyPDF2')
except ImportError:
    msg = json1.json_msg_module_not_accessible('PyPDF2')
    lg.error(msg)
    lg.info(json1.json_last_log_msg())
    sys.exit(msg)
import PyPDF2


# handle external signals requesting termination
def signal_handler(signal, frame):
    """
    Log that termination was requested externally, then exit.

    SIGNAL and FRAME are accepted but not used by the handler.
    """
    # Ensure receipt of signal is logged prior to terminating
    msg = json1.json_exit_on_external_request_msg()
    lg.error(msg)
    lg.info(json1.json_last_log_msg())
    sys.exit()


# SIGHUP is not defined on every platform (e.g. Windows); register it
# only where the signal module provides it so import does not fail.
if hasattr(signal, 'SIGHUP'):
    signal.signal(signal.SIGHUP, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)