Example #1
0
def split_pdf_to_png_files (pdf_file_spec,output_dir):
    """
    Split the PDF file specified by PDF_FILE_SPEC into a series of
    files, each representing a single page as a PNG image. Write files
    to directory specified by OUTPUT_DIR.

    PDF_FILE_SPEC must be an absolute path; if it is not, the problem
    is logged and the process terminates via sys.exit.

    Returns whatever pdf.pdf_to_pngs returns (expected to be an array
    of (<file_name>,<page_number>) tuples). If the conversion raises,
    the failure is logged and printed, and the process terminates via
    sys.exit.
    """
    try:
        # sanity check
        if not os.path.isabs(pdf_file_spec):
            msg = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108, [msg], False, files=[pdf_file_spec]))
            # SystemExit does not derive from Exception, so the handler
            # below does not intercept this exit.
            sys.exit(msg)
        # array of (<file_name>,<page_number>) tuples
        png_specs = pdf.pdf_to_pngs(pdf_file_spec, output_dir)
    except Exception as e:
        msg = json1.json_failed_to_convert_pdf(e, pdf_file_spec)
        lg.error(msg)
        # A single parenthesized argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3). The 1-tuples
        # keep the formatting intact even if a value is itself a tuple.
        print("failed to convert PDF file(s): %s" % (pdf_file_spec,))
        print("e: %s" % (e,))
        lg.info(json1.json_last_log_msg())
        sys.exit(msg)
    else:
        lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec, png_specs))
        return png_specs
Example #2
0
def invoke_pdfimages_on (pdf_file_spec,output_dir):
    """
    Pull the images out of the PDF file named by PDF_FILE_SPEC, one
    PNG file per image, writing them into the directory OUTPUT_DIR.

    PDF_FILE_SPEC has to be an absolute path; a relative path is
    logged as an error and the process exits.

    The return value is the result of pdf.pdfimages: a list of
    (png_file,png_file_page_number) tuples, where
    png_file_page_number is an integer and the list is ordered by
    page number, low to high. A failure during extraction is logged
    and the process exits.
    """
    try:
        if os.path.isabs(pdf_file_spec):
            page_image_pairs = pdf.pdfimages(pdf_file_spec, output_dir)
        else:
            # sanity check failed: refuse relative paths
            reason = "The input PDF must be specified as an absolute file path"
            lg.error(json1.json_msg(108, [reason], False, files=[pdf_file_spec]))
            sys.exit(reason)
    except Exception as err:
        lg.debug(str(err))
        failure = json1.json_failed_to_convert_pdf(err, pdf_file_spec)
        lg.error(failure)
        lg.info(json1.json_last_log_msg())
        sys.exit(failure)
    else:
        # The PNG file names are not logged here (they would have to be
        # dug out of the tuples), so None is passed in their place.
        lg.info(json1.json_pdf_to_pngs_success(pdf_file_spec, None))
        return page_image_pairs
Example #3
0
def barcodeScan(imagePNGPath, scan_region):
    """
    Scan the image file IMAGEPNGPATH for a barcode and return the
    result of barcode_scan_at_resolutions: intended to be None if a
    barcode was not found and, if a barcode was found, a string
    corresponding to the barcode-encoded data.

    Search within the region defined by SCAN_REGION when SCAN_REGION
    is a list. When SCAN_REGION is a list, it specifies two points as
    [x1,y1,x2,y2]. These two points (x1,y1) and (x2,y2) are pairs
    (x,y) of percentages (each expressed as a value between 0 and 1.0)
    relative to the dimensions of the image; they define the box
    within which the barcode scan occurs. The points may be given in
    either order. A value outside [0,1] is logged as an error and the
    process terminates via sys.exit.

    If SCAN_REGION is not a list, the full image is analyzed. If
    analysis of the full image is desirable, do not set SCAN_REGION to
    [0,0,1,1] but instead set it to None or some other non-list value.
    """
    # sanity check(s)
    if not isinstance(scan_region, list):
        scan_region = None
    else:
        for value in scan_region:
            if value < 0 or value > 1:
                msg = json1.json_msg(999, "insane scan region value", False, None)
                lg.error(msg)
                lg.info(json1.json_last_log_msg())
                sys.exit(msg)
    # PIL origin (0,0) is top left corner.
    # 'L' mode converts the image to 8-bit black and white pixels.
    pil = Image.open(imagePNGPath).convert('L')
    pilCropped = pil
    width, height = pil.size
    lg.debug("width: %s height: %s", width, height)
    if scan_region:
        # relative (percentage) values between 0 and 1
        x_crop_min = min(scan_region[0], scan_region[2])
        x_crop_max = max(scan_region[0], scan_region[2])
        y_crop_min = min(scan_region[1], scan_region[3])
        y_crop_max = max(scan_region[1], scan_region[3])
        # Horizontal edges scale with the image width and vertical edges
        # with its height; scaling x by the height misplaces the box on
        # any image that is not square.
        cropLeft = int(width * x_crop_min)
        cropRight = int(width * x_crop_max)
        cropTop = int(height * y_crop_min)
        cropBottom = int(height * y_crop_max)
        # crop box is 4-tuple: left,upper,right,lower
        pilCropBox = (cropLeft, cropTop, cropRight, cropBottom)
        pilCropped = pil.crop(pilCropBox)
    # zbar sometimes catches a barcode at a lower resolution but misses
    # it at a higher resolution, so scan several variants of the image.
    barcodeString = barcode_scan_at_resolutions(pilCropped, None)
    if not barcodeString:
        lg.warning(json1.json_barcode_not_found_msg([imagePNGPath], ""))
    return barcodeString
Example #4
0
def module_sanity_check (module_name,exitp):
    """
    Confirm that the module named by the string MODULE_NAME can be
    located with imp.find_module.

    When the lookup raises ImportError the problem is logged; the
    process then terminates via sys.exit if EXITP is true, otherwise
    the function simply returns.
    """
    try:
        imp.find_module(module_name)
    except ImportError:
        # the module might not be installed/accessible
        not_accessible_msg = json1.json_msg_module_not_accessible(module_name)
        lg.error(not_accessible_msg)
        lg.info(json1.json_last_log_msg())
        if not exitp:
            return
        sys.exit(not_accessible_msg)
Example #5
0
def executable_sanity_checks (executables):
    """
    Verify that every executable named in the list of strings
    EXECUTABLES can be found by util.which. The first one that cannot
    be found is logged as an error and the process terminates via
    sys.exit.
    """
    lg.debug(util)
    for candidate in executables:
        if util.which(candidate):
            continue
        not_accessible_msg = json1.json_msg_executable_not_accessible(candidate)
        lg.error(not_accessible_msg)
        lg.info(json1.json_last_log_msg())
        sys.exit(not_accessible_msg)
Example #6
0
def signal_handler(signal, frame):
    """
    Signal handler: log that termination was requested externally,
    then exit. Neither SIGNAL nor FRAME is inspected.
    """
    # The log entries are written before exiting so that receipt of the
    # signal is always recorded.
    lg.error(json1.json_exit_on_external_request_msg())
    lg.info(json1.json_last_log_msg())
    sys.exit()
Example #7
0
def main():
    """
    Handle command-line invocation of pdfxcb.py.

    Parses the command line, points the root logger at a freshly
    truncated log file (default 'busca.log') that records bare
    messages, and then runs pdfxcb on the single input PDF.

    The log level defaults to logging.INFO when -l is absent or is
    outside the range 0..51. Raises Exception if the directory of the
    log file does not exist. Any exception raised by pdfxcb is logged
    and re-raised.
    """
    global lg
    parser = argparse.ArgumentParser(description="This is pdfxcb")
    parser.add_argument("-f",
                        help="absolute path to log file",
                        action="store",
                        dest="log_file",
                        type=str)
    parser.add_argument("-d",
                        help="absolute path to output directory",
                        action="store",
                        dest="output_dir",
                        type=str)
    parser.add_argument("-m",
                        help="match barcodes to regex (ignore if no match)",
                        action="store",
                        dest="match_re_string",
                        type=str)
    parser.add_argument("-p",
                        help="identifier for a specific instance of pdfxcb",
                        action="store",
                        dest="identifier",
                        type=str)
    parser.add_argument("-l",
                        help="integer between 0 (verbose) and 51 (terse) defining logging",
                        action="store",
                        dest="log_level",
                        type=int)
    #parser.add_argument('-v', '--version', action='version', version=version.version)
    parser.add_argument("input_files", help="an input (PDF) file",
                        # keep nargs as we may want to accept multiple PDFs as input at some point
                        nargs=1,
                        type=str)
    args = parser.parse_args()
    #
    # define logging (level, file, message format, ...)
    #
    log_level = args.log_level
    if not (isinstance(log_level, int) and 0 <= log_level <= 51):
        # since this function doesn't necessarily exit quickly
        log_level = logging.INFO
    logfile = args.log_file if args.log_file else 'busca.log'
    json_log_format = '%(message)s'
    root_logger = lg.getLogger()
    # Iterate over a copy: removeHandler mutates the handlers list, and
    # removing from the list being iterated would skip every other handler.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
    formatter = logging.Formatter(json_log_format)
    # sanity check for existence of log file directory
    log_dir = os.path.dirname(logfile)
    if log_dir and not os.path.exists(log_dir):
        raise Exception("log file directory {0} not present".format(log_dir))
    # mode 'w' truncates any existing log file
    file_handler = logging.FileHandler(logfile, 'w')
    file_handler.setFormatter(formatter)
    root_logger.addHandler(file_handler)
    root_logger.setLevel(log_level)
    lg.debug("args: %s", args)
    lg.debug("sys.argv: %s", sys.argv)
    # fall back to a generated identifier when none was supplied with -p
    if args.identifier:
        identifier = args.identifier
    else:
        identifier = str(uuid.uuid1())
    # 1000[0-9][0-9][0-9]$ matches on tt user id
    match_re_string = args.match_re_string
    lg.debug(match_re_string)
    match_re = None
    if match_re_string:
        match_re = re.compile(match_re_string)
    pdf_file_spec = args.input_files[0]
    lg.debug(pdf_file_spec)
    lg.info(json1.json_first_log_msg(identifier, files = [pdf_file_spec] ))
    rasterize_p = False
    # generic debugging
    lg.debug(os.getcwd())         # current/working directory
    # might also want to import platform to get architecture, other details...
    try:
        pdfxcb(pdf_file_spec, args.output_dir, match_re, rasterize_p)
    except Exception:
        # record the exception type, then let the exception propagate
        lg.error("Crash and burn")
        lg.error(sys.exc_info()[0])
        raise
    lg.info(json1.json_last_log_msg())
Example #8
0
def file_sanity_check (file,exitp):
    """
    Check that FILE names an existing regular file.

    When it does not, the problem is logged and, if EXITP is true,
    the process terminates via sys.exit. Returns None otherwise.
    """
    if os.path.isfile(file):
        return
    lg.error(json1.json_file_not_found(file))
    lg.info(json1.json_last_log_msg())
    if exitp:
        sys.exit("File " + file + " not found.")
Example #9
0
def directory_sanity_check (directory_spec,exitp):
    """
    Check that DIRECTORY_SPEC names an existing directory.

    When it does not, the problem is logged and, if EXITP is true,
    the process terminates via sys.exit. Returns None otherwise.
    """
    if os.path.isdir(directory_spec):
        return
    lg.error(json1.json_file_not_found(directory_spec))
    lg.info(json1.json_last_log_msg())
    if exitp:
        sys.exit("Directory " + directory_spec + " not found.")
Example #10
0
# bubbles.py, deskew.py, feature_detect.py, and util.py use numpy
# try:
#     imp.find_module('numpy')
# except ImportError:
#     msg = json1.json_msg_module_not_accessible('numpy')
#     lg.error(msg)
#     lg.info(json1.json_last_log_msg())
#     sys.exit(msg)

# Confirm that PyPDF2 can be located before importing it, so that a missing
# module is reported through the log and the process exits with that message
# rather than failing on the import statement below.
try:
    imp.find_module('PyPDF2')
except ImportError:
    # NOTE: msg is bound at module level here.
    msg = json1.json_msg_module_not_accessible('PyPDF2')
    lg.error(msg)
    lg.info(json1.json_last_log_msg())
    sys.exit(msg)
# Only reached when the lookup above did not raise ImportError.
import PyPDF2


# handle external signals requesting termination
def signal_handler(signal, frame):
    """
    Signal handler: log that termination was requested externally,
    then exit. Neither SIGNAL nor FRAME is inspected.
    """
    # The log entries are written before exiting so that receipt of the
    # signal is always recorded.
    lg.error(json1.json_exit_on_external_request_msg())
    lg.info(json1.json_last_log_msg())
    sys.exit()

# Install signal_handler for each termination signal this platform defines.
# SIGHUP is only available on Unix; referencing it unconditionally raises
# AttributeError elsewhere, so names missing from the signal module are skipped.
for _signal_name in ("SIGHUP", "SIGINT", "SIGTERM"):
    _signal_number = getattr(signal, _signal_name, None)
    if _signal_number is not None:
        signal.signal(_signal_number, signal_handler)