def main(args):
    """Split a PDF into fixed-size chunks and save a JPEG per chunk and cover.

    Args:
        args: Sequence of three items: the PDF filename, the directory the
            split files are written into, and the number of pages per chunk
            (anything ``int`` accepts).

    The process exits with status 1 when the page count is not a multiple of
    the chunk size. Any other exception is printed as a traceback and
    swallowed.
    """
    filename = args[0]
    split_path = args[1]
    num = int(args[2])
    try:
        # One handle for the whole run: pages are pulled from the reader while
        # the chunks are written, and the handle is closed on every exit path.
        with open(filename, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            total_pages = pdf_reader.numPages
            if total_pages % num != 0:
                print(filename + " not divisible by " + str(num))
                sys.exit(1)

            # width used to zero-pad the start index in the output names
            max_length = len(str(total_pages - num))
            base_name = filename[:-4]

            os.chdir(split_path)
            for start in range(0, total_pages, num):
                prepended_index = str(start).zfill(max_length)
                cover_filename = '{}_{}_cover.pdf'.format(base_name, prepended_index)
                output_filename = '{}_{}.pdf'.format(base_name, prepended_index)

                # the first page of every chunk doubles as its cover
                cover_writer = PdfFileWriter()
                cover_writer.addPage(pdf_reader.getPage(start))

                pdf_writer = PdfFileWriter()
                for page_number in range(start, start + num):
                    pdf_writer.addPage(pdf_reader.getPage(page_number))
                with open(output_filename, 'wb') as out:
                    pdf_writer.write(out)

                # save pdfs as images
                with open(output_filename, 'rb') as chunk_file:
                    pdf_images = convert_from_bytes(chunk_file.read())
                # Every page image goes to the same path, so the file ends up
                # holding the last page of the chunk.
                for image in pdf_images:
                    image.save('{}.jpg'.format(output_filename[:-4]),
                               "JPEG", quality=100)

                with open(cover_filename, 'wb') as out:
                    cover_writer.write(out)

                # save cover as image
                with open(cover_filename, 'rb') as cover_file:
                    pdf_images = convert_from_bytes(cover_file.read())
                pdf_images[0].save('{}.jpg'.format(cover_filename[:-4]),
                                   "JPEG", quality=100)
    except Exception:
        traceback.print_exc()
Exemple #2
0
 def test_conversion_from_bytes_using_dir_241(self):  # pragma: no cover
     """Convert the 241-page fixture into a temporary folder and print the time per page."""
     started_at = time.time()
     with TemporaryDirectory() as path, open("./tests/test_241.pdf", "rb") as pdf_file:
         images_from_bytes = convert_from_bytes(pdf_file.read(),
                                                output_folder=path)
         self.assertTrue(len(images_from_bytes) == 241)
         # release the images before the temporary folder is removed
         for im in images_from_bytes:
             im.close()
     seconds_per_page = (time.time() - started_at) / 241.0
     print("test_conversion_from_bytes_using_dir_241: {} sec".format(seconds_per_page))
Exemple #3
0
def get_text(file):
    """Return the OCR text of every page of a PDF file object.

    Each page is rendered to an image and written to ``temp.jpeg`` before
    ``ocr_azure()`` is called (presumably it reads that file; confirm against
    its definition). The text of every line in the first read result is
    appended followed by a space, and every page ends with a blank line.
    """
    from pdf2image import convert_from_bytes
    import requests

    page_texts = []
    for page_image in convert_from_bytes(file.read()):
        page_image.save("temp.jpeg")
        response = ocr_azure()
        page_lines = response['analyzeResult']['readResults'][0]['lines']
        joined = "".join(entry['text'] + " " for entry in page_lines)
        page_texts.append(joined + "\n\n")
    return "".join(page_texts)
 def _try_rasterisation(self):
     """
     Render every page to an image and run the QR code finder on each one.
     Last-resort path for when regular PDF parsing found no codes.
     Returns the set of all codes found across the pages.
     """
     from pdf2image import convert_from_bytes
     found = set()
     # rewind so the whole document is read regardless of earlier reads
     self._pdf_binary.seek(0)
     for page_image in convert_from_bytes(self._pdf_binary.read()):
         page_codes = self.parse_qr_code(page_image)
         if page_codes:
             found.update(page_codes)
     return found
Exemple #5
0
 def test_conversion_from_bytes_using_dir_14_first_page_2_last_page_12(self):
     """Convert pages 2 to 12 of the 14-page fixture and expect eleven images."""
     started_at = time.time()
     page_range = {'first_page': 2, 'last_page': 12}
     with TemporaryDirectory() as path, open('./tests/test_14.pdf', 'rb') as pdf_file:
         images_from_bytes = convert_from_bytes(pdf_file.read(),
                                                output_folder=path,
                                                **page_range)
         self.assertTrue(len(images_from_bytes) == 11)
         for im in images_from_bytes:
             im.close()
     seconds_per_page = (time.time() - started_at) / 14.
     print('test_conversion_from_bytes_using_dir_14_first_page_2_last_page_12: {} sec'
           .format(seconds_per_page))
Exemple #6
0
def extract_pdf():
    """Store an uploaded form as an image, OCR it and render the extraction page.

    The upload is read from ``request.files['file']``. A PDF upload is
    rasterised and its first page saved as a JPEG; any other upload is opened
    as an image and saved under its sanitised name. Two fixed regions of the
    image are OCR'd to recognise the form; only the ankylosing spondylitis /
    infliximab form is passed on to ``extract_anky``.

    Returns:
        The rendered ``index.html`` with an error message for an empty or
        unsupported upload, otherwise the rendered ``extraction.html`` filled
        with the seven values returned by ``extract_anky``.
    """
    data = request.files['file']

    if data.filename == '':
        return render_template('index.html',
                               msg='Your uploaded file is null!!')

    extension = data.filename.split('.')

    if extension[-1].lower() == 'pdf':
        print('PDF')
        page = convert_from_bytes(data.read())
        # The name is client-supplied: without sanitising, a name that starts
        # with a path separator would make os.path.join discard UPLOAD_FOLDER.
        safe_stem = secure_filename(extension[0])
        full_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                     safe_stem + '.jpg')
        img = page[0]
        img.save(full_filename, 'JPEG')

    else:
        print('Images')
        img = Image.open(data)
        filename = secure_filename(data.filename)
        full_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        img.save(full_filename, 'JPEG')

    # fixed pixel regions of the page that are OCR'd to identify the form
    form_type_img = img.crop((1064, 180, 1353, 243))
    form_medicine_img = img.crop((855, 181, 976, 234))

    form_type = pytesseract.image_to_string(form_type_img)
    form_medicine = pytesseract.image_to_string(form_medicine_img)

    is_supported_form = ('ankylosing spondylitis' in form_type.lower()
                         and 'infliximab' in form_medicine.lower())
    if not is_supported_form:
        return render_template(
            'index.html', msg='Currently we did not support this form type!!')

    print('Proceed to form OCR and allow edit')
    ocr_res = extract_anky(img)

    return render_template(
        'extraction.html',
        image_name=full_filename,
        weight=ocr_res[0],
        date=ocr_res[1],
        init_bas=ocr_res[2],
        curr_bas=ocr_res[3],
        init_ga=ocr_res[4],
        curr_ga=ocr_res[5],
        additional=ocr_res[6],
    )
Exemple #7
0
def print_image_from_tab(tab, payload_data, ppi):
    """Load the internal page in a browser tab, print it to PDF and return one image.

    ``payload_data`` is handed to the page through a ``payload_data`` cookie.
    The function polls until the document is complete with no request
    outstanding for ten consecutive checks, or until
    ``config.LOAD_TIME_LIMIT`` seconds have passed, and then prints the page.
    ``ppi`` is accepted but not used here.
    """
    # one entry per request that has been sent and not yet finished loading
    pending_requests = []

    def on_request_sent(**kwargs):
        logger.debug(f"载入:{kwargs['request']['url']}")
        pending_requests.append(True)

    def on_loading_finished(**kwargs):
        logger.debug('完成载入')
        pending_requests.pop()

    tab.Network.requestWillBeSent = on_request_sent
    tab.Network.loadingFinished = on_loading_finished

    tab.Network.setCookie(name='payload_data',
                          value=payload_data,
                          url=config.INT_BASE_URL)

    tab.Page.navigate(url=f'{config.INT_BASE_URL}/internal/index.html')

    consecutive_idle_checks = 0
    started_at = time.time()
    while time.time() - started_at < config.LOAD_TIME_LIMIT:
        tab.wait(0.1)
        evaluation = tab.Runtime.evaluate(expression="document.readyState")
        ready_state = evaluation['result']['value']
        logger.debug(f'页面状态:{ready_state}')
        if ready_state != 'complete' or pending_requests:
            # any activity restarts the run of quiet checks
            consecutive_idle_checks = 0
            continue
        consecutive_idle_checks += 1
        if consecutive_idle_checks == 10:
            break

    content_size = tab.Page.getLayoutMetrics()['contentSize']

    # paper size is the content size divided by 96 (presumably CSS pixels per
    # inch; confirm against the printToPDF units)
    printed = tab.Page.printToPDF(paperWidth=content_size['width'] / 96,
                                  paperHeight=content_size['height'] / 96,
                                  marginTop=0,
                                  marginBottom=0,
                                  marginLeft=0,
                                  marginRight=0,
                                  printBackground=True)
    pdf_data = b64decode(printed['data'])

    return convert_from_bytes(pdf_data, single_file=True)[0]
Exemple #8
0
 def test_close_tempfile_after_conversion(self):
     """Check that fifty conversions leave the number of open descriptors unchanged."""

     def fd_listing_length():
         # number of lines `ls -l` prints for this process's /proc fd directory
         listing = subprocess.check_output(
             ['ls', '-l', '/proc/' + str(os.getpid()) + '/fd'])
         return len(listing.decode('utf8').split('\n'))

     start_time = time.time()
     with open('./tests/test.pdf', 'rb') as pdf_file:
         fd_count_before = fd_listing_length()
         pdf_data = pdf_file.read()
         images_from_bytes = []
         for _ in range(50):
             images_from_bytes.extend(convert_from_bytes(pdf_data))
         # close every image before counting again
         for im in images_from_bytes:
             im.close()
         fd_count_after = fd_listing_length()
         # allow a small difference between the two counts
         self.assertTrue(abs(fd_count_before - fd_count_after) <= 3)
     elapsed = time.time() - start_time
     print('test_close_tempfile_after_conversion: {} sec'.format(elapsed))
 def _search_document_pdf(self):
     """Rasterise the stored base64 PDF and collect the search results of every page.

     A pdf2image failure is logged with its traceback and re-raised as
     ``OCRException``.
     """
     self.ensure_one()
     pdf_bytes = base64.b64decode(self.datas)
     try:
         images = pdf2image.convert_from_bytes(pdf_bytes)
     except (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError) as e:
         _logger.warning(traceback.format_exc())
         raise OCRException(str(e))
     records = []
     for page_image in images:
         records.extend(self._search_pil_image(page_image))
     return records
Exemple #10
0
async def on_message(message):
    """React to a chat message: greet on ``!hello`` and preview arxiv PDF links.

    Messages written by the bot itself are ignored. For a message that ends
    in ``.pdf`` and mentions ``arxiv``, the message text is fetched as a URL,
    the first page is saved as ``screenshot.png`` and sent to the channel.
    """
    # never answer our own messages
    if message.author == client.user:
        return

    text = message.content

    if text.startswith('!hello'):
        greeting = 'Hello {0.author.mention}'.format(message)
        await client.send_message(message.channel, greeting)

    looks_like_arxiv_pdf = text.endswith('.pdf') and 'arxiv' in text
    if not looks_like_arxiv_pdf:
        return

    response = requests.get(text)
    first_page = pdf2image.convert_from_bytes(response.content)[0]
    first_page.save("screenshot.png", filename="screenshot.png")

    await client.send_file(message.channel, "screenshot.png")
Exemple #11
0
def convert_to_imgs(pdf_path):
    """Rasterise the PDF at ``pdf_path`` into JPEG files under ``/tmp/``.

    Returns whatever ``convert_from_bytes`` yields with ``paths_only=True``.
    The first 1500 bytes of the PDF and the resulting names are logged.
    """
    logger.info("Converting PDF to Images")
    with open(pdf_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    logger.info(pdf_bytes[:1500])

    folder_path = '/tmp/'
    conversion_options = {
        'dpi': 500,
        'poppler_path': 'poppler_binaries/',
        'output_folder': folder_path,
        'fmt': 'JPEG',
        'paths_only': True,
    }
    image_paths = pdf2image.convert_from_bytes(pdf_bytes, **conversion_options)

    pdfs_in_folder = glob.glob(folder_path + "*.pdf")
    logger.info(f'PDFs are {pdfs_in_folder}')
    logger.info(f'Images are {image_paths}')
    return image_paths
Exemple #12
0
    def _process_pdf(self, file_string):
        """Return the OCR text of every page of a base64-encoded PDF.

        ``file_string`` may carry a data-URL style prefix; everything up to
        and including the last ``base64,`` is dropped before decoding.
        """
        file_string = file_string.split('base64,')[-1].strip()
        images = convert_from_bytes(base64.b64decode(file_string))
        # join once instead of growing the string page by page
        return "".join(pytesseract.image_to_string(image) for image in images)
Exemple #13
0
def convert(filename, file):
    """Convert PDF bytes to a multi-page Group 4 TIFF and return it base64-encoded.

    The TIFF is written to ``./data/<filename>.tif``, read back, and removed
    again before returning.
    """
    output_path = f"./data/{filename}.tif"
    # mode "1" gives the bilevel pages that Group 4 compression is applied to
    bilevel_pages = [page.convert("1") for page in convert_from_bytes(file)]
    first_page = bilevel_pages[0]
    following_pages = bilevel_pages[1:]
    first_page.save(output_path,
                    format='TIFF',
                    dpi=(400, 400),
                    compression="group4",
                    save_all=True,
                    append_images=following_pages)
    with open(output_path, "rb") as tiff_file:
        encoded = b64encode(tiff_file.read())
    os.remove(output_path)
    return encoded
Exemple #14
0
    def image_from_bytes(pdf, page=1):
        """Render one page of a PDF as a JPEG thumbnail of at most 400x400.

        Args:
            pdf: The PDF document as bytes.
            page: 1-based number of the page to render.

        Returns:
            A ``ContentFile`` holding the JPEG data, or ``None`` when the PDF
            cannot be parsed or the conversion does not yield exactly one page.
        """
        try:
            pages = convert_from_bytes(pdf, first_page=page, last_page=page)
        except (PDFPageCountError, PDFSyntaxError):
            return None

        if len(pages) != 1:
            return None

        buffer = BytesIO()
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        # under its current name.
        pages[0].thumbnail((400, 400), Image.LANCZOS)
        pages[0].save(fp=buffer, format='JPEG')

        return ContentFile(buffer.getvalue())
def convert():
    """Turn the base64 PDF in the request JSON into a JSON list of base64 JPEGs.

    Reads ``request.json['pdf_file']`` and returns a JSON string of the form
    ``{"images": [...]}`` with one entry per page.
    """

    def to_base64_jpeg(page_image):
        # encode a single page as an ASCII base64 JPEG string
        jpeg_buffer = io.BytesIO()
        page_image.save(jpeg_buffer, format='jpeg')
        return base64.b64encode(jpeg_buffer.getvalue()).decode('ascii')

    pdf_bytes = base64.b64decode(request.json['pdf_file'])
    pages = convert_from_bytes(pdf_bytes, poppler_path='poppler_binaries/')

    return json.dumps({'images': [to_base64_jpeg(page) for page in pages]})
Exemple #16
0
def pdf2IMG(pdf, path=None):
    """Render a PDF to a list of JPEG data URLs, one per page.

    Args:
        pdf: The PDF as bytes; used only when ``path`` is None.
        path: Optional path to a PDF file; when given it is used instead of
            ``pdf``.

    Returns:
        A list of ``data:image/jpeg;base64,...`` strings.
    """
    if path is None:
        pages = pdf2image.convert_from_bytes(pdf)
    else:
        pages = convert_from_path(path)

    imgs = []
    for page in pages:
        img_file_object = io.BytesIO()
        page.save(img_file_object, 'JPEG')
        # getvalue() returns the whole buffer whatever the position, so no
        # rewind is needed first
        img_base64 = convert2base64(img_file_object.getvalue())
        imgs.append('data:image/jpeg;base64,%s' % img_base64)

    return imgs
Exemple #17
0
def dlp_redation(request):
    """Redact every page of a PDF stored in a bucket and merge the results.

    The request JSON must carry ``source_url``. The PDF is downloaded, each
    page is saved as ``page<N>.jpg``, passed through ``redact_`` and
    ``pdf_conv``, and the resulting per-page PDFs are merged into
    ``MergedFiles.pdf`` in the working directory.

    Returns:
        The string ``"success"``.

    Raises:
        ValueError: If the request has no JSON body.
    """
    request_json = request.get_json()
    if not request_json:
        # previously this fell through to a NameError on source_url
        raise ValueError("request body must be JSON containing 'source_url'")
    source_url = request_json['source_url']
    print("source_url=" + source_url)

    from sep_blob_bucket import regex_  # separates bucket name and blob name of a cloud storage URL
    from storage_download import download_blob
    import re
    import os
    from pdf2image import convert_from_path, convert_from_bytes
    from redaction import redact_
    from image2pdf import pdf_conv
    import PyPDF2
    pdfWriter = PyPDF2.PdfFileWriter()

    # raw string: the backslashes are path separators, not escape sequences
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json"
    # e.g. "gs://context_primary/Forms/NotProcessed/DD2875_AUG_2009_wh (1).pdf"
    link = regex_(source_url)
    # the result is split on '+' into bucket name and blob name
    bucket_and_blob = re.split('[+]', link)
    bucket_name = bucket_and_blob[0]
    blob_name = bucket_and_blob[1]
    print(bucket_name)
    print(blob_name)

    pdf_as_bytes = download_blob(bucket_name, blob_name).download_as_bytes()
    images = convert_from_bytes(pdf_as_bytes)

    # The per-page PDFs stay open until the merged file has been written
    # (their pages are only handed to the writer, not copied here), and all
    # of them are closed afterwards, even for an empty document.
    opened_pdfs = []
    try:
        for index, image in enumerate(images):
            output_file_name = "page" + str(index) + '.jpg'
            image.save(output_file_name, 'JPEG')
            redacted_pdf = open(
                pdf_conv(redact_(output_file_name, 'elaborate-howl-285701')),
                'rb')
            opened_pdfs.append(redacted_pdf)
            redacted_reader = PyPDF2.PdfFileReader(redacted_pdf)
            for pageNum in range(redacted_reader.numPages):
                pdfWriter.addPage(redacted_reader.getPage(pageNum))
        with open('MergedFiles.pdf', 'wb') as merged_file:
            pdfWriter.write(merged_file)
    finally:
        for opened in opened_pdfs:
            opened.close()
    return ("success")
def load_pdf(fname=None, img_list=None, **kwargs):
    """
    Load a PDF from a path or from bytes and return its pages both as encoded
    byte strings and as PIL images.

    :type fname: str, bytes
    :param fname: Path to the pdf, or the bytes of a pre-loaded pdf

    :param img_list: Accepted but not used by this function

    Remaining keyword arguments are forwarded to the pdf2image converter:

    :type dpi: int
    :param dpi: Image quality in DPI

    :type fmt: str
    :param fmt: Output image format; ``"bmp"`` when not given

    :type jpegopt: dict
    :param jpegopt: jpeg options (only for jpeg format)
        {
        `quality`: 0-100,
        `progressive`: "y" OR "n",
        `optimize`: "y" OR "n"
        }

    first_page -> First page to process
    last_page -> Last page to process before stopping
    thread_count -> How many threads we are allowed to spawn for processing
    userpw -> PDF's password
    use_cropbox -> Use cropbox instead of mediabox
    transparent -> Output with a transparent background instead of a white one.
    poppler_path -> Path to look for poppler binaries
    grayscale -> Output grayscale image(s)

    Returns a ``(pages, pil_imgs)`` tuple and raises ``ValueError`` when
    ``fname`` is neither ``str`` nor ``bytes``.
    """
    # make sure the converter and the re-encoding below agree on the format
    fmt = kwargs.setdefault("fmt", "bmp")

    if not isinstance(fname, (str, bytes)):
        raise ValueError(f"Unsuported type: {type(fname)}")

    if isinstance(fname, str):
        pil_imgs = convert_from_path(fname, **kwargs)
    else:
        pil_imgs = convert_from_bytes(fname, **kwargs)

    def encode(pil_img):
        # serialise one PIL image into bytes in the requested format
        buffer = io.BytesIO()
        pil_img.save(buffer, format=fmt)
        return buffer.getvalue()

    pages = [encode(pil_img) for pil_img in pil_imgs]
    return pages, pil_imgs
Exemple #19
0
 def _search_document_pdf(self, datas):
     if self.env.context.get("document_quick_access_reject_file"):
         return []
     if self.env.context.get("force_object_process"):
         return [self.env.context["force_object_process"]]
     records = []
     try:
         images = pdf2image.convert_from_bytes(base64.b64decode(datas))
     except (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError) as e:
         buff = StringIO()
         traceback.print_exc(file=buff)
         _logger.warning(buff.getvalue())
         raise UserError(str(e))
     for im in images:
         records += self._search_pil_image(im)
     return records
Exemple #20
0
def snapshot_pdf_images(pdf, sample_dir, update_snapshot):
    """Render a PDF page by page next to the stored sample images.

    With ``update_snapshot`` set, the sample images are rewritten first. Every
    page is then saved as a generated image, and both the sample and the
    generated file are opened again.
    NOTE(review): the opened images are not used any further here; the
    comparison step may be missing from this excerpt.
    """
    # https://github.com/Belval/pdf2image
    # https://gist.github.com/santiago-kai/9a18ffabbc49bc2518c695cc140e0214

    def sample_path(page_number):
        return "{}sample_page_{}.jpg".format(sample_dir, page_number)

    def generated_path(page_number):
        return "{}generated_page_{}.jpg".format(sample_dir, page_number)

    images = convert_from_bytes(pdf)
    if update_snapshot:
        for page_number, image in enumerate(images, start=1):
            image.save(sample_path(page_number))
    for page_number, image in enumerate(images, start=1):
        image.save(generated_path(page_number))
        sample_image = Image.open(sample_path(page_number))
        # Reopened from disk so both sides went through the same save step
        # (the bytes depend on the compression used when writing the file).
        generated_image = Image.open(generated_path(page_number))
Exemple #21
0
def decode_b64_to_img(b64, file_type):
    """Decode a base64 string into an image array.

    Args:
        b64: The base64 payload as a ``str``.
        file_type: ``'pdf'`` to rasterise the first page of a PDF, ``'img'``
            to decode an encoded image.

    Returns:
        A numpy array for ``'pdf'``, whatever ``imdecode`` returns for
        ``'img'``, and ``None`` for invalid base64 or any other ``file_type``.
    """
    try:
        # base64.decodestring was removed in Python 3.9; decodebytes is its
        # replacement. Invalid input raises binascii.Error, a ValueError.
        b64_data = base64.decodebytes(bytes(b64, 'utf8'))
    except ValueError:
        return None

    if file_type == 'pdf':
        img_list = convert_from_bytes(b64_data)
        return np.asarray(img_list[0])

    if file_type == 'img':
        file_like = BytesIO(b64_data)
        # flag -1 is passed through to imdecode unchanged
        decoded_img = imdecode(np.frombuffer(file_like.getbuffer(), np.uint8), -1)
        return decoded_img

    return None
Exemple #22
0
def create_pdf_images(p, f, b=False):
    """Render a PDF to jpg images inside a folder.

    Inputs:
        p: Path of the PDF without its ".pdf" extension, or the PDF bytes
           when ``b`` is set
        f: Folder the jpg images are written to
        b: Whether ``p`` holds bytes instead of a path (Boolean)
    Output:
        List of PIL images of the pages

    """
    # compared by equality so that only values equal to False select the path branch
    read_from_path = (b == False)
    if read_from_path:
        source = p + ".pdf"
        return convert_from_path(source, output_folder=f, fmt='jpg')
    return convert_from_bytes(p, output_folder=f, fmt='jpg')
Exemple #23
0
def convert_pdf_to_jpeg(pdf: typing.Union[str, typing.IO[bytes]],
                        preview_size: ImgDims) -> BytesIO:
    """Render each page of a PDF as a resized JPEG written into one buffer.

    ``pdf`` is read with ``.read()``. Every page is resized to the dimensions
    ``compute_resize_dims`` returns for ``preview_size`` and appended to the
    same ``BytesIO``, which is returned rewound to its start.
    NOTE(review): the JPEGs of a multi-page PDF are written one after the
    other into the same stream; confirm that callers expect this.
    """
    page_images = convert_from_bytes(pdf.read())

    jpeg_stream = BytesIO()
    for page_image in page_images:
        original_dims = ImgDims(page_image.width, page_image.height)
        target_dims = compute_resize_dims(original_dims, preview_size)
        preview = page_image.resize((target_dims.width, target_dims.height),
                                    resample=True)
        preview.save(jpeg_stream, format="JPEG")

    jpeg_stream.seek(0, 0)
    return jpeg_stream
def process_financial_document(file_path=None,
                               url=None,
                               pdf_bytes=None,
                               show_logs=None):
    """Extract the contents of a financial disclosure PDF.

    Args:
        file_path: Path of a PDF on disk.
        url: URL to download the PDF from. When given, it replaces anything
            loaded from ``file_path`` or passed as ``pdf_bytes``.
        pdf_bytes: The PDF as bytes.
        show_logs: When truthy, the root logger level is set to INFO.

    Returns:
        ``None`` when no source is given; a dict with ``"success": False`` and
        a ``"msg"`` when the document structure cannot be determined;
        otherwise the dict returned by ``process_document`` extended with
        ``page_count``, ``pdf_size``, ``wealth`` and ``"success": True``.
    """
    if show_logs:
        logging.getLogger().setLevel(logging.INFO)

    logging.info("Beginning Extraction of Financial Document")

    if not file_path and not url and not pdf_bytes:
        logging.warning(
            "\n\n--> No file, url or pdf_bytes submitted<--\n--> Exiting early\n"
        )
        return

    if file_path:
        logging.info("Opening PDF document from path")
        with open(file_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
    if url:
        logging.info("Downloading PDF from URL")
        pdf_bytes = requests.get(url, stream=True).content

    # Turn the PDF into an array of images
    pages = convert_from_bytes(pdf_bytes)
    page_total = len(pages)
    logging.info("Document is %s pages long", page_total)

    logging.info("Determining document structure")
    try:
        document_structure, check_count = extract_contours_from_page(pages)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer reported as a missing-checkbox result.
        return {"success": False, "msg": CheckboxesNotFound}

    if check_count < 8:
        logging.warning("Failed to extract document structure")
        return {"success": False, "msg": "Failed to process document properly"}

    logging.info("Extracting content from financial disclosure")
    results = process_document(document_structure, pages, show_logs)
    results["page_count"] = page_total
    results["pdf_size"] = len(pdf_bytes)

    results["wealth"] = estimate_investment_net_worth(results)

    results["success"] = True
    return results
Exemple #25
0
def pdf_to_images(pdfs):
    """Rasterise a PDF file object at 500 DPI and run text generation on the pages.

    Each page is written to ``Page_<n>.jpg`` in the working directory, loaded
    back with ``cv2.imread`` and passed, together with the other pages, to
    ``text_generation``. The temporary files are removed before returning,
    including when a step fails.

    Returns:
        Whatever ``text_generation`` returns for the list of loaded pages.
    """
    pages = convert_from_bytes(pdfs.read(), 500)
    x_data = []
    written_names = []
    try:
        for number, page in enumerate(pages, start=1):
            image_name = "Page_" + str(number) + ".jpg"
            # recorded before saving so a partially written file is cleaned up too
            written_names.append(image_name)
            page.save(image_name, "JPEG")
            x_data.append(cv2.imread(image_name))
        text = text_generation(x_data)
    finally:
        for image_name in written_names:
            if os.path.exists(image_name):
                os.remove(image_name)
    return text
Exemple #26
0
 def get(self, request, *args, **kwargs):
     """Serve the schedule as the regular response, as a PNG download, or redirect.

     ``self.pdf_or_png`` selects the output: ``"pdf"`` renders the normal
     response, ``"png"`` renders the PDF and returns its first page as a PNG
     attachment, and anything else redirects to ``/``.
     """
     context = self.get_context_data(**kwargs)
     output_kind = self.pdf_or_png
     if output_kind == "pdf":
         return self.render_to_response(context)
     if output_kind != "png":
         return HttpResponseRedirect("/")

     pdf = render_to_pdf("schedule.html",
                         context=context,
                         request=request,
                         **kwargs)
     first_page = convert_from_bytes(pdf)[0]
     response = HttpResponse(content_type='image/png')
     response['Content-Disposition'] = 'attachment; filename=ITUscheduler'
     # the image is written straight into the response object
     first_page.save(response, "PNG")
     return response
Exemple #27
0
 def ocr(self, pdf_bin, search_model):
     """OCR a PDF and extract field values with the stored regular expressions.

     Args:
         pdf_bin: The PDF as bytes.
         search_model: Model name used to filter the expression records
             (``ilike`` on ``re_model``); a falsy value is treated as ``""``.

     Returns:
         ``None`` when the conversion or the OCR fails. Otherwise a dict
         keyed by model name whose values map field names to extracted
         values; the entry for ``search_model`` also carries the complete
         OCR text under ``'description'``.
     """
     if not search_model:
         search_model = ""
     res_data = {}
     try:
         images = convert_from_bytes(pdf_bin)
     except Exception:
         _logger.warning('Failed to convert pdf')
         return None
     text = ''
     for im in images:
         try:
             text += pytesseract.image_to_string(im, lang='spa')
         except Exception:
             _logger.warning('Failed to OCR')
             return None
     for rex in self.search([('re_model', 'ilike', search_model)]):
         res = re.search(rex.re_exp, text,
                         re.MULTILINE | re.DOTALL | re.IGNORECASE)
         if not res:
             continue
         model_data = res_data.setdefault(rex.re_model, {})
         # the first expression that fills a field wins
         if rex.re_field in model_data:
             continue
         if rex.re_searchModel:
             # re_searchString holds "field,operator,value" for a lookup in
             # re_searchModel; the id of the first match is stored
             reA = rex.re_searchString.split(',')
             rec = self.env[rex.re_searchModel].search([
                 (reA[0], reA[1], reA[2])
             ])
             if rec and len(rec) > 0:
                 model_data[rex.re_field] = rec[0].id
             continue
         # a fixed re_value takes precedence over the captured group
         value = rex.re_value if rex.re_value else res.group(1)
         if rex.re_field == 'unit_amount':
             # accept a decimal comma
             value = float(value.replace(',', '.'))
             if value < 2:
                 _logger.warning('value too small: %s. regex: %s',
                                 value, rex.re_exp)
         model_data[rex.re_field] = value
     if search_model not in res_data:
         res_data[search_model] = {}
     res_data[search_model]['description'] = text
     return res_data
 def pdf_to_png(self):
     """Rasterise the input PDF at 500 DPI into JPEG output.

     Without ``self.gcp`` the PDF at ``self.in_path`` is converted from disk:
     a single page is saved as ``<out_path>.jpg``, several pages as
     ``<out_path>_<index>.jpg``. With ``self.gcp`` the PDF is read through
     ``self.fs`` and a single page is stored as a JPEG ``BytesIO`` in
     ``self.bs_image``; a multi-page document is only reported.
     """
     if not self.gcp:
         pages = convert_from_path(self.in_path, 500)
         if len(pages) == 1:
             pages[0].save(self.out_path + ".jpg", "JPEG")
         else:
             # enumerate yields (index, page); the index has to be turned
             # into a string before it can be joined into the file name
             for n, page in enumerate(pages):
                 page.save(self.out_path + "_" + str(n) + ".jpg", "JPEG")
     else:
         with self.fs.open(self.in_path, 'rb') as f:
             pages = convert_from_bytes(f.read(), 500)
         if len(pages) == 1:
             byteio = BytesIO()
             pages[0].save(byteio, 'JPEG')
             self.bs_image = byteio
         else:
             print("Balance sheet must be one page only")
Exemple #29
0
 def test_conversion_from_bytes_using_dir_single_file(self):
     """Convert with ``single_file=True`` and expect one image named ``test.ppm``."""
     started_at = time.time()
     with TemporaryDirectory() as path, open("./tests/test.pdf", "rb") as pdf_file:
         images_from_bytes = convert_from_bytes(pdf_file.read(),
                                                output_folder=path,
                                                output_file="test",
                                                single_file=True)
         expected_name = os.path.join(path, "test.ppm")
         self.assertTrue(len(images_from_bytes) == 1)
         self.assertTrue(images_from_bytes[0].filename == expected_name)
         for im in images_from_bytes:
             im.close()
     elapsed = time.time() - started_at
     print("test_conversion_from_bytes_using_dir_single_file: {} sec".format(elapsed))
def convert(event, context):
    """Convert the base64 PDF in ``event['pdf_file']`` to base64 JPEG pages.

    :return: A dictionary ``{'images': [...]}`` with one base64 string per page
    """
    pdf_bytes = base64.b64decode(event['pdf_file'])
    pages = pdf2image.convert_from_bytes(pdf_bytes,
                                         poppler_path='poppler_binaries/')

    encoded_pages = []
    for page in pages:
        jpeg_buffer = io.BytesIO()
        page.save(jpeg_buffer, format='jpeg')
        jpeg_b64 = base64.b64encode(jpeg_buffer.getvalue())
        encoded_pages.append(jpeg_b64.decode('ascii'))

    return {'images': encoded_pages}
def convert_pdf(pdf_byte, user_id):
    """Rasterise a PDF and store every page as a JPEG in a fresh per-user folder.

    Args:
        pdf_byte: The PDF as bytes.
        user_id: Identifier used as the first folder level under ``temp/``.

    Returns:
        The list of written file paths, of the form
        ``temp/<user_id>/<uuid>/output-<n>.jpg``.
    """
    images = convert_from_bytes(pdf_byte, fmt="jpeg")
    print(len(images))
    current_uuid = uuid.uuid4()
    # the folder is the same for every page of this call
    output_dir = 'temp/{}/{}'.format(user_id, current_uuid)
    list_location = []
    for num, image in enumerate(images):
        # exist_ok replaces the manual exists/EEXIST race guard; the folder is
        # still only created when there is at least one page
        os.makedirs(output_dir, exist_ok=True)
        image_name = '{}/output-{}.jpg'.format(output_dir, num)
        image.save(image_name, format='JPEG')
        list_location.append(image_name)
    return list_location
Exemple #32
0
def main(args):
    """Scan a PDF for QR codes and split it into one PDF per QR-coded cover page.

    Args:
        args: Sequence of four items: the PDF filename (resolved after
            changing into the split directory), the split directory, and the
            prefix and suffix to strip from decoded QR values.

    Every page that carries a QR code starts a new output PDF; pages without
    one are appended to the current output. The decoded ids and page counts
    are merged into ``decoded.json`` and the original file is removed.
    On any exception a message and traceback are printed and the process
    exits with status 1.
    """
    filename = args[0]
    split_path = args[1]
    qr_prefix = args[2]
    qr_suffix = args[3]
    try:
        os.chdir(split_path)
        pdfPages = PdfFileReader(filename)
        # Read the document once; it used to be re-read through a fresh,
        # never-closed handle for every page.
        with open(filename, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()
        pdf_writer = PdfFileWriter()
        i = cover_index = id_index = 0
        page_count = 1
        prev_file = ''
        data = []
        output = {}
        for page_number in range(pdfPages.numPages):
            # convert just this page to an image for scanning (only the first
            # converted image was ever used, so one page is enough)
            page = convert_from_bytes(
                pdf_bytes,
                first_page=page_number+1, last_page=page_number+1)[0]
            # increase contrast of image for better QR decoding
            cv_img = numpy.array(page)
            mask = cv2.inRange(cv_img, (0, 0, 0), (200, 200, 200))
            inverted = 255 - cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
            # decode img - only look for QR codes
            val = pyzbar.decode(inverted, symbols=[ZBarSymbol.QRCODE])
            if val != []:
                # found a new qr code, split here
                # convert byte literal to string
                data = val[0][0].decode("utf-8")
                if data == "none":  # blank exam with 'none' qr code
                    data = "BLANK EXAM"
                else:
                    pre = data[0:len(qr_prefix)]
                    suf = data[(len(data)-len(qr_suffix)):len(data)]
                    if qr_prefix != '' and pre == qr_prefix:
                        data = data[len(qr_prefix):]
                    if qr_suffix != '' and suf == qr_suffix:
                        data = data[:-len(qr_suffix)]
                cover_index = i
                cover_filename = '{}_{}_cover.pdf'.format(filename[:-4], i)
                output_filename = '{}_{}.pdf'.format(filename[:-4], cover_index)

                output[output_filename] = {}
                output[output_filename]['id'] = data
                # save pdf
                # NOTE(review): the image saved under the previous file's name
                # below is the current page (the new cover); confirm this is
                # the intended preview.
                if i != 0 and prev_file != '':
                    output[prev_file]['page_count'] = page_count
                    with open(prev_file, 'wb') as out:
                        pdf_writer.write(out)

                    page.save('{}.jpg'.format(prev_file[:-4]), "JPEG", quality=100)

                if id_index == 1:
                    # correct first pdf's page count and print file
                    output[prev_file]['page_count'] = page_count
                    with open(prev_file, 'wb') as out:
                        pdf_writer.write(out)

                    page.save('{}.jpg'.format(prev_file[:-4]), "JPEG", quality=100)

                # start a new pdf and grab the cover
                cover_writer = PdfFileWriter()
                pdf_writer = PdfFileWriter()
                cover_writer.addPage(pdfPages.getPage(i))
                pdf_writer.addPage(pdfPages.getPage(i))

                # save cover
                with open(cover_filename, 'wb') as out:
                    cover_writer.write(out)

                # save cover image
                page.save('{}.jpg'.format(cover_filename[:-4]), "JPEG", quality=100)

                id_index += 1
                page_count = 1
                prev_file = output_filename
            else:
                # add pages to current split_pdf
                page_count += 1
                pdf_writer.addPage(pdfPages.getPage(i))
            i += 1

        # save whatever is left
        # (a document without any QR code has no entry here; the resulting
        # KeyError is handled by the except below)
        output_filename = '{}_{}.pdf'.format(filename[:-4], cover_index)
        output[output_filename]['id'] = data
        output[output_filename]['page_count'] = page_count

        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)

        if not os.path.exists('decoded.json'):
            # write json to file for parsing page counts and decoded ids later
            with open('decoded.json', 'w') as out:
                json.dump(output, out, sort_keys=True, indent=4)
        else:
            # merge with the results of earlier runs
            with open('decoded.json') as file:
                prev_data = json.load(file)

            prev_data.update(output)

            with open('decoded.json', 'w') as out:
                json.dump(prev_data, out)

        # remove original, unsplit file
        os.remove(filename)
    except Exception:
        print("\nbulk_qr_split.py: Failed when splitting pdf " + str(filename))
        traceback.print_exc()
        sys.exit(1)