def main(args):
    """Split a PDF into `num`-page chunks plus a per-chunk cover page,
    saving a JPEG alongside every PDF written.

    :param args: [filename, split_path, num] — exits with status 1 when the
        total page count is not divisible by num. Any other failure is
        printed via traceback and swallowed.
    """
    filename = args[0]
    split_path = args[1]
    num = int(args[2])
    try:
        # fix: the source PDF was previously opened twice (leaking the first
        # handle) and never closed; one context manager now covers both the
        # divisibility check and the splitting pass
        with open(filename, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            total_pages = pdfReader.numPages
            if total_pages % num != 0:
                print(filename + " not divisible by " + str(num))
                sys.exit(1)
            # zero-pad chunk indices to the width of the largest start page
            max_length = len(str(total_pages - num))
            i = 0
            os.chdir(split_path)
            while i < total_pages:
                cover_writer = PdfFileWriter()
                cover_writer.addPage(pdfReader.getPage(i))
                prepended_index = str(i).zfill(max_length)
                cover_filename = '{}_{}_cover.pdf'.format(filename[:-4], prepended_index)
                output_filename = '{}_{}.pdf'.format(filename[:-4], prepended_index)
                pdf_writer = PdfFileWriter()
                start = i
                for j in range(start, start + num):
                    pdf_writer.addPage(pdfReader.getPage(j))
                    i += 1
                with open(output_filename, 'wb') as out:
                    pdf_writer.write(out)
                # save chunk pages as images
                # NOTE(review): every page of the chunk is written to the same
                # .jpg name, so only the last page's image survives — confirm
                # whether per-page filenames were intended
                with open(output_filename, 'rb') as chunk_file:
                    pdf_images = convert_from_bytes(chunk_file.read())
                for k in range(len(pdf_images)):
                    pdf_images[k].save('{}.jpg'.format(output_filename[:-4]),
                                       "JPEG", quality=100)
                with open(cover_filename, 'wb') as out:
                    cover_writer.write(out)
                # save cover as image
                with open(cover_filename, 'rb') as cover_file:
                    pdf_images = convert_from_bytes(cover_file.read())
                pdf_images[0].save('{}.jpg'.format(cover_filename[:-4]),
                                   "JPEG", quality=100)
    except Exception:
        traceback.print_exc()
def test_conversion_from_bytes_using_dir_241(self):  # pragma: no cover
    """A 241-page PDF converted from bytes into a temp dir yields 241 images."""
    start_time = time.time()
    with TemporaryDirectory() as path:
        with open("./tests/test_241.pdf", "rb") as pdf_file:
            pages = convert_from_bytes(pdf_file.read(), output_folder=path)
        self.assertTrue(len(pages) == 241)
        for page in pages:
            page.close()
        print(
            "test_conversion_from_bytes_using_dir_241: {} sec".format(
                (time.time() - start_time) / 241.0
            )
        )
def get_text(file):
    """OCR every page of a PDF file-like object and return the combined text.

    Each page is rasterised, saved to disk, then read back by ocr_azure();
    pages are separated by a blank line in the result.
    """
    from pdf2image import convert_from_bytes
    import requests

    ocr_text = ""
    for page_image in convert_from_bytes(file.read()):
        # presumably ocr_azure() picks the image up from "temp.jpeg" on
        # disk — TODO confirm against its implementation
        page_image.save("temp.jpeg")
        json_file = ocr_azure()
        for line in json_file['analyzeResult']['readResults'][0]['lines']:
            ocr_text += line['text'] + " "
        ocr_text += "\n\n"
    return ocr_text
def _try_rasterisation(self):
    """
    Rasterise every PDF page and scan the resulting images for QR codes.

    Last-resort path, used when regular PDF parsing found no codes.
    """
    from pdf2image import convert_from_bytes

    self._pdf_binary.seek(0)
    pages = convert_from_bytes(self._pdf_binary.read())
    found = set()
    for page in pages:
        page_codes = self.parse_qr_code(page)
        if page_codes:
            found = found.union(page_codes)
    return found
def test_conversion_from_bytes_using_dir_14_first_page_2_last_page_12(
        self):
    """Pages 2..12 of a 14-page PDF must yield exactly 11 images."""
    start_time = time.time()
    with TemporaryDirectory() as path:
        with open('./tests/test_14.pdf', 'rb') as pdf_file:
            pages = convert_from_bytes(pdf_file.read(),
                                       output_folder=path,
                                       first_page=2,
                                       last_page=12)
        self.assertTrue(len(pages) == 11)
        for page in pages:
            page.close()
        print(
            'test_conversion_from_bytes_using_dir_14_first_page_2_last_page_12: {} sec'
            .format((time.time() - start_time) / 14.))
def extract_pdf():
    """Flask view: accept an uploaded PDF or image, OCR the form header,
    and render the extraction page for supported forms.

    Reads the upload from request.files['file']. Only forms whose header
    OCRs to 'ankylosing spondylitis' + 'infliximab' proceed to the full
    extract_anky() pass; anything else re-renders the index with a message.
    """
    data = request.files['file']
    if (data.filename == ''):
        # no file was selected in the upload form
        return render_template('index.html', msg='Your uploaded file is null!!')
    extension = data.filename.split('.')
    if (extension[-1].lower() == 'pdf'):
        print('PDF')
        # rasterise the PDF; only the first page is used below
        page = convert_from_bytes(data.read())
        # NOTE(review): extension[0] keeps only the text before the FIRST
        # dot, so 'a.b.pdf' becomes 'a.jpg' — confirm that is acceptable
        full_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                     extension[0] + '.jpg')
        img = page[0]
        img.save(full_filename, 'JPEG')
    else:
        print('Images')
        img = Image.open(data)
        filename = secure_filename(data.filename)
        full_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        img.save(full_filename, 'JPEG')
    # crop the header regions holding the form type and medicine name;
    # pixel coordinates assume a fixed scan layout — TODO confirm
    form_type_img = img.crop((1064, 180, 1353, 243))
    form_medicine_img = img.crop((855, 181, 976, 234))
    form_type = pytesseract.image_to_string(form_type_img)
    form_medicine = pytesseract.image_to_string(form_medicine_img)
    if (form_type.lower().find('ankylosing spondylitis') != -1
            and form_medicine.lower().find('infliximab') != -1):
        print('Proceed to form OCR and allow edit')
        ocr_res = extract_anky(img)
    else:
        return render_template(
            'index.html', msg='Currently we did not support this form type!!')
    return render_template(
        'extraction.html',
        image_name=full_filename,
        weight=ocr_res[0],
        date=ocr_res[1],
        init_bas=ocr_res[2],
        curr_bas=ocr_res[3],
        init_ga=ocr_res[4],
        curr_ga=ocr_res[5],
        additional=ocr_res[6],
    )
def print_image_from_tab(tab, payload_data, ppi):
    """Render an internal page in a DevTools-driven browser tab, print it
    to PDF, and return the single rendered page as a PIL image.

    :param tab: tab object exposing Network/Page/Runtime DevTools domains
    :param payload_data: value stored in the 'payload_data' cookie before
        navigation
    :param ppi: unused in this body — TODO confirm whether callers need it
    """
    # outstanding network requests are tracked by pushing/popping markers
    loading_counter = []

    def request_will_be_sent(**kwargs):
        logger.debug(f"载入:{kwargs['request']['url']}")
        loading_counter.append(True)

    tab.Network.requestWillBeSent = request_will_be_sent

    def loading_finished(**kwargs):
        logger.debug('完成载入')
        loading_counter.pop()

    tab.Network.loadingFinished = loading_finished
    tab.Network.setCookie(name='payload_data', value=payload_data,
                          url=config.INT_BASE_URL)
    tab.Page.navigate(url=f'{config.INT_BASE_URL}/internal/index.html')
    # wait (bounded by LOAD_TIME_LIMIT) until the DOM is complete AND no
    # request is in flight for 10 consecutive 0.1 s polls
    success_count = 0
    begin_time = time.time()
    while time.time() - begin_time < config.LOAD_TIME_LIMIT:
        tab.wait(0.1)
        ready_state = tab.Runtime.evaluate(
            expression="document.readyState")['result']['value']
        logger.debug(f'页面状态:{ready_state}')
        if ready_state == 'complete' and not loading_counter:
            success_count += 1
            if success_count == 10:
                break
        else:
            # any regression resets the stability window
            success_count = 0
    layout_metrics = tab.Page.getLayoutMetrics()
    content_size = layout_metrics['contentSize']
    # size the PDF paper to the full content extent (96 CSS px per inch)
    # so nothing is clipped, then rasterise the single resulting page
    pdf_data = b64decode(
        tab.Page.printToPDF(paperWidth=content_size['width'] / 96,
                            paperHeight=content_size['height'] / 96,
                            marginTop=0,
                            marginBottom=0,
                            marginLeft=0,
                            marginRight=0,
                            printBackground=True)['data'])
    return convert_from_bytes(pdf_data, single_file=True)[0]
def test_close_tempfile_after_conversion(self):
    """Converting the same PDF 50 times must not leak file descriptors."""
    start_time = time.time()
    with open('./tests/test.pdf', 'rb') as pdf_file:
        fd_count_before = len(subprocess.check_output(
            ['ls', '-l', '/proc/' + str(os.getpid()) + '/fd']
        ).decode('utf8').split('\n'))
        pdf_data = pdf_file.read()
        images = []
        for _ in range(50):
            images.extend(convert_from_bytes(pdf_data))
        # Closing the images
        for im in images:
            im.close()
        pid = os.getpid()
        fd_count_after = len(subprocess.check_output(
            ['ls', '-l', '/proc/' + str(os.getpid()) + '/fd']
        ).decode('utf8').split('\n'))
        # Add an error margin
        self.assertTrue(abs(fd_count_before - fd_count_after) <= 3)
    print('test_close_tempfile_after_conversion: {} sec'.format(
        (time.time() - start_time)))
def _search_document_pdf(self):
    """Rasterise self.datas (base64 PDF) and collect matches page by page.

    Raises OCRException when pdf2image cannot process the document; the
    original traceback is preserved in the warning log.
    """
    self.ensure_one()
    try:
        pages = pdf2image.convert_from_bytes(base64.b64decode(self.datas))
    except (
        PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError
    ) as e:
        buff = StringIO()
        traceback.print_exc(file=buff)
        _logger.warning(buff.getvalue())
        raise OCRException(str(e))
    records = []
    for page in pages:
        records += self._search_pil_image(page)
    return records
async def on_message(message):
    """Discord handler: greet on '!hello'; for arxiv PDF links, post a
    screenshot of the paper's first page.

    :param message: incoming discord message object
    """
    # we do not want the bot to reply to itself
    if message.author == client.user:
        return

    if message.content.startswith('!hello'):
        msg = 'Hello {0.author.mention}'.format(message)
        await client.send_message(message.channel, msg)

    if message.content.endswith('.pdf') and 'arxiv' in message.content:
        pdf = requests.get(message.content)
        screenshot = pdf2image.convert_from_bytes(pdf.content)[0]
        # fix: PIL's save() takes the destination path positionally; the
        # old filename= keyword duplicated it as an ignored save parameter
        screenshot.save("screenshot.png")
        await client.send_file(message.channel, "screenshot.png")
def convert_to_imgs(pdf_path):
    """Convert a PDF on disk into 500-DPI JPEG page files and return their paths."""
    logger.info("Converting PDF to Images")
    with open(pdf_path, 'rb') as pdf_handle:
        pdf_bytes = pdf_handle.read()
        logger.info(pdf_bytes[:1500])
    folder_path = '/tmp/'
    # paths_only=True keeps memory flat: pdf2image writes the JPEGs to
    # folder_path and hands back filenames instead of PIL images
    file_names = pdf2image.convert_from_bytes(pdf_bytes,
                                              dpi=500,
                                              poppler_path='poppler_binaries/',
                                              output_folder=folder_path,
                                              fmt='JPEG',
                                              paths_only=True)
    logger.info(f'PDFs are {glob.glob(folder_path+"*.pdf")}')
    logger.info(f'Images are {file_names}')
    return file_names
def _process_pdf(self, file_string):
    """OCR a base64-encoded PDF (optionally a data: URI) into plain text.

    :param file_string: base64 PDF payload; anything up to and including a
        'base64,' prefix is stripped first
    :return: concatenated pytesseract text of all pages
    """
    file_string = file_string.split('base64,')[-1].strip()
    # fix: removed the unused `pic = io.StringIO()` local and the dead
    # commented-out background-compositing code
    images = convert_from_bytes(base64.b64decode(file_string))
    ocr_str = ""
    for image in images:
        ocr_str += pytesseract.image_to_string(image)
    return ocr_str
def convert(filename, file):
    """Convert PDF bytes to a 1-bit, Group-4-compressed, multi-page TIFF and
    return it base64-encoded.

    :param filename: stem used for the temporary TIFF written under ./data/
    :param file: raw PDF bytes
    :return: base64-encoded TIFF bytes
    """
    # fix: the f-string previously contained no placeholder, ignoring
    # `filename` entirely — every call wrote the same literal path, so
    # concurrent calls clobbered each other's output
    output_path = f"./data/{filename}.tif"
    images = convert_from_bytes(file)
    # "1" = 1-bit bilevel, which Group-4 compression requires
    images = [i.convert("1") for i in images]
    images[0].save(
        output_path,
        format='TIFF',
        dpi=(400, 400),
        compression="group4",
        save_all=True,
        append_images=images[1:])
    with open(output_path, "rb") as f:
        tiff_data = b64encode(f.read())
    os.remove(output_path)
    return tiff_data
def image_from_bytes(pdf, page=1):
    """Render one PDF page as a max-400x400 JPEG thumbnail.

    :param pdf: raw PDF bytes
    :param page: 1-based page number to render
    :return: ContentFile holding the JPEG, or None when the page cannot be
        rendered (bad PDF or page out of range)
    """
    try:
        pages = convert_from_bytes(pdf, first_page=page, last_page=page)
    except (PDFPageCountError, PDFSyntaxError):
        return None
    if len(pages) != 1:
        return None
    buffer = BytesIO()
    # fix: Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS has been
    # the equivalent filter constant since Pillow 2.7
    pages[0].thumbnail((400, 400), Image.LANCZOS)
    pages[0].save(fp=buffer, format='JPEG')
    pillow_image = ContentFile(buffer.getvalue())
    return pillow_image
def convert():
    """Flask endpoint: decode request.json['pdf_file'] (base64 PDF) and
    return a JSON string with each page as base64-encoded JPEG."""
    pdf_bytes = base64.b64decode(request.json['pdf_file'])
    pages = convert_from_bytes(pdf_bytes, poppler_path='poppler_binaries/')
    encoded_pages = []
    for page in pages:
        buf = io.BytesIO()
        page.save(buf, format='jpeg')
        encoded_pages.append(base64.b64encode(buf.getvalue()).decode('ascii'))
    return json.dumps({'images': encoded_pages})
def pdf2IMG(pdf, path=None):
    """Convert a PDF to a list of data-URI JPEG strings, one per page.

    :param pdf: raw PDF bytes; used when `path` is None
    :param path: optional filesystem path; when given, it is used instead
        of the bytes
    :return: list of 'data:image/jpeg;base64,...' strings
    """
    # fix: compare against None with `is`, not `==`
    if path is None:
        pages = pdf2image.convert_from_bytes(pdf)
    else:
        pages = convert_from_path(path)
    imgs = []
    for page in pages:
        img_file_object = io.BytesIO()
        page.save(img_file_object, 'JPEG')
        img_file_object.seek(0)
        img_base64 = convert2base64(img_file_object.getvalue())
        imgs.append('data:image/jpeg;base64,%s' % img_base64)
    return imgs
def dlp_redation(request):
    """Cloud-Function-style handler: download a PDF from GCS, redact each
    rasterised page via DLP, and merge the redacted pages into
    MergedFiles.pdf.

    :param request: Flask request whose JSON body carries 'source_url'
        (a gs:// URL to the source PDF)
    :return: "success" when JSON was present; otherwise falls off the end
        and returns None — TODO confirm that is intended
    """
    request_json = request.get_json()
    if request_json:
        source_url = request_json['source_url']
        print("source_url=" + source_url)
        # local imports keep module import cheap in the function runtime
        from sep_blob_bucket import regex_  # splits a gs:// URL into "bucket+blob"
        from storage_download import download_blob
        import re
        import os
        from pdf2image import convert_from_path, convert_from_bytes
        from redaction import redact_
        from image2pdf import pdf_conv
        import PyPDF2
        pdfWriter = PyPDF2.PdfFileWriter()
        # NOTE(review): hard-coded local credentials path — move to
        # deployment configuration; never ship credential paths in code
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json"
        # e.g. "gs://context_primary/Forms/NotProcessed/DD2875_AUG_2009_wh (1).pdf"
        link = regex_(
            source_url
        )
        # regex_ joins bucket and blob with '+'; split them back apart
        bucket_and_blob = re.split('[+]', link)
        bucket_name = bucket_and_blob[0]  # bucket name in GCS
        blob_name = bucket_and_blob[1]  # blob name in GCS
        print(bucket_name)
        print(blob_name)
        # download the whole PDF into memory as bytes
        pdf_as_bytes = download_blob(
            bucket_name, blob_name).download_as_bytes()
        images = convert_from_bytes(pdf_as_bytes)
        for x in range(0, len(images)):
            # save each rasterised page, redact it, convert it back to a
            # one-page PDF, and append its pages to the merged writer
            output_file_name = "page" + str(x) + '.jpg'
            converted_pdf2image = images[x].save(output_file_name, 'JPEG')
            pdf1File = open(
                pdf_conv(redact_(output_file_name, 'elaborate-howl-285701')),
                'rb')
            pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
            for pageNum in range(pdf1Reader.numPages):
                pageObj = pdf1Reader.getPage(pageNum)
                pdfWriter.addPage(pageObj)
        pdfOutputFile = open('MergedFiles.pdf', 'wb')
        pdfWriter.write(pdfOutputFile)
        pdfOutputFile.close()
        # only the last per-page PDF handle is closed here — earlier
        # handles leak; TODO confirm and fix in a follow-up
        pdf1File.close()
        return ("success")
def load_pdf(fname=None, img_list=None, **kwargs):
    """
    Render a PDF (path or bytes) into per-page encoded image byte arrays.

    :param fname: path to the PDF (str) or the raw PDF bytes
    :param img_list: unused; retained for backward compatibility with
        existing callers
    :param kwargs: forwarded to pdf2image — dpi, fmt, jpegopt
        ({'quality': 0-100, 'progressive': 'y'/'n', 'optimize': 'y'/'n'}),
        first_page, last_page, thread_count, userpw, use_cropbox,
        transparent, poppler_path, grayscale, ...
    :return: (pages, pil_imgs) — pages is a list of image bytes encoded in
        `fmt` (default "bmp"); pil_imgs are the corresponding PIL images
    :raises ValueError: when fname is neither str nor bytes
    """
    fmt = kwargs.get("fmt", "bmp")
    kwargs["fmt"] = fmt
    # convert pdf into a list of PIL images
    if isinstance(fname, str):
        pil_imgs = convert_from_path(fname, **kwargs)
    elif isinstance(fname, bytes):
        pil_imgs = convert_from_bytes(fname, **kwargs)
    else:
        # fix: raise the exception directly (the extra parentheses were
        # redundant) and correct the "Unsuported" typo in the message
        raise ValueError(f"Unsupported type: {type(fname)}")
    pages = []
    for img in pil_imgs:
        imgByteArr = io.BytesIO()
        img.save(imgByteArr, format=fmt)
        pages.append(imgByteArr.getvalue())
    return pages, pil_imgs
def _search_document_pdf(self, datas):
    """Rasterise a base64 PDF and collect matching records page by page.

    Context switches:
      document_quick_access_reject_file -- short-circuit with no records
      force_object_process -- skip the search, return that object directly
    """
    ctx = self.env.context
    if ctx.get("document_quick_access_reject_file"):
        return []
    if ctx.get("force_object_process"):
        return [ctx["force_object_process"]]
    try:
        pages = pdf2image.convert_from_bytes(base64.b64decode(datas))
    except (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError) as e:
        # keep the original traceback in the log, then raise a user error
        buff = StringIO()
        traceback.print_exc(file=buff)
        _logger.warning(buff.getvalue())
        raise UserError(str(e))
    records = []
    for page in pages:
        records += self._search_pil_image(page)
    return records
def snapshot_pdf_images(pdf, sample_dir, update_snapshot):
    """Write each rendered PDF page next to its stored snapshot image.

    When update_snapshot is truthy the sample (reference) images are
    refreshed first. Generated images are always written, then both files
    are reopened so their on-disk bytes are comparable (compression applied
    on save makes the in-memory pixels insufficient).
    """
    # https://github.com/Belval/pdf2image
    # https://gist.github.com/santiago-kai/9a18ffabbc49bc2518c695cc140e0214
    sample_path_pattern = "{}sample_page_{}.jpg"
    generated_path_pattern = "{}generated_page_{}.jpg"
    pages = convert_from_bytes(pdf)
    if update_snapshot:
        for page_no, page in enumerate(pages, start=1):
            page.save(sample_path_pattern.format(sample_dir, page_no))
    for page_no, page in enumerate(pages, start=1):
        page.save(generated_path_pattern.format(sample_dir, page_no))
        # Reopen to have consistent data bytes to bytes (depends of
        # compression used when saving to file).
        sample_image = Image.open(
            sample_path_pattern.format(sample_dir, page_no))
        generated_image = Image.open(
            generated_path_pattern.format(sample_dir, page_no))
def decode_b64_to_img(b64, file_type):
    """Decode a base64 payload into a numpy image.

    :param b64: base64-encoded file content (str)
    :param file_type: 'pdf' (first page is rasterised) or 'img'
    :return: numpy array image; None on bad base64 or unknown file_type
    """
    try:
        # fix: base64.decodestring was deprecated and removed in Python
        # 3.9; decodebytes is the drop-in replacement
        b64_data = base64.decodebytes(bytes(b64, 'utf8'))
    except ValueError:
        # binascii.Error (a ValueError subclass) on malformed input
        return
    if file_type == 'pdf':
        img_list = convert_from_bytes(bytes(b64_data))
        return np.asarray(img_list[0])
    if file_type == 'img':
        file_like = BytesIO(b64_data)
        decoded_img = imdecode(np.frombuffer(file_like.getbuffer(), np.uint8), -1)
        return decoded_img
    return None
def create_pdf_images(p, f, b=False):
    """Create images from PDF.

    Create jpg images from either a PDF file or bytestream.

    Inputs:
        p: Path stem of the PDF (when b is False) or the PDF bytes (when b is True)
        f: Folder to put jpg images in
        b: Bytes IO (Boolean)
    Output:
        List of PIL images of the pages
    """
    # fix: idiomatic truthiness test instead of comparing to False with ==
    if not b:
        return convert_from_path(p + ".pdf", output_folder=f, fmt='jpg')
    return convert_from_bytes(p, output_folder=f, fmt='jpg')
def convert_pdf_to_jpeg(pdf: typing.Union[str, typing.IO[bytes]],
                        preview_size: ImgDims) -> BytesIO:
    """Rasterise a PDF and write every page, resized to fit preview_size,
    as JPEG into one BytesIO (rewound to position 0 before returning).

    NOTE(review): all page JPEGs are concatenated into a single stream; a
    plain JPEG decoder will only see the first page — confirm callers
    expect that.
    """
    # .read() is called unconditionally, so a file-like object is required
    # despite the `str` option in the annotation — TODO confirm annotation
    pdf = pdf.read()
    images = convert_from_bytes(pdf)
    output = BytesIO()
    for image in images:
        resize_dims = compute_resize_dims(ImgDims(image.width, image.height),
                                          preview_size)
        # NOTE(review): resample expects a PIL filter constant; True == 1
        # only selects a real filter by coincidence — consider an explicit
        # Image.LANCZOS / Image.BILINEAR here
        resized = image.resize((resize_dims.width, resize_dims.height),
                               resample=True)
        resized.save(output, format="JPEG")
    output.seek(0, 0)
    return output
def process_financial_document(file_path=None, url=None, pdf_bytes=None, show_logs=None):
    """Extract the contents of a financial disclosure document.

    Exactly one source should be supplied:
    :param file_path: path to a PDF on disk
    :param url: URL to download the PDF from
    :param pdf_bytes: raw PDF bytes
    :param show_logs: when truthy, raise root logging verbosity to INFO
    :return: results dict with a 'success' flag, or None when no source
        was supplied
    """
    if show_logs:
        logging.getLogger().setLevel(logging.INFO)

    logging.info("Beginning Extraction of Financial Document")
    if not file_path and not url and not pdf_bytes:
        logging.warning(
            "\n\n--> No file, url or pdf_bytes submitted<--\n--> Exiting early\n"
        )
        return
    if file_path:
        logging.info("Opening PDF document from path")
        # fix: the file handle was previously left open
        with open(file_path, "rb") as f:
            pdf_bytes = f.read()
    if url:
        logging.info("Downloading PDF from URL")
        pdf_bytes = requests.get(url, stream=True).content

    # Turn the PDF into an array of images
    pages = convert_from_bytes(pdf_bytes)
    page_total = len(pages)
    logging.info("Document is %s pages long" % page_total)

    logging.info("Determining document structure")
    try:
        document_structure, check_count = extract_contours_from_page(pages)
    # fix: narrowed the bare except so SystemExit/KeyboardInterrupt escape
    except Exception:
        return {"success": False, "msg": CheckboxesNotFound}

    if check_count < 8:
        logging.warning("Failed to extract document structure")
        return {"success": False, "msg": "Failed to process document properly"}

    logging.info("Extracting content from financial disclosure")
    results = process_document(document_structure, pages, show_logs)
    results["page_count"] = page_total
    results["pdf_size"] = len(pdf_bytes)
    results["wealth"] = estimate_investment_net_worth(results)
    results["success"] = True
    return results
def pdf_to_images(pdfs):
    """Render each page of a PDF file object at 500 DPI, run text
    generation on the page images, and delete the temporary JPEGs."""
    pages = convert_from_bytes(pdfs.read(), 500)
    x_data = []
    for page_no, page in enumerate(pages, start=1):
        image_name = "Page_" + str(page_no) + ".jpg"
        page.save(image_name, "JPEG")
        # re-read via OpenCV so downstream gets ndarray frames
        x_data.append(cv2.imread(image_name))
    text = text_generation(x_data)
    for page_no in range(1, len(pages) + 1):
        os.remove("Page_" + str(page_no) + ".jpg")
    return text
def get(self, request, *args, **kwargs):
    """Serve the schedule as a PDF template response or as a PNG
    attachment, depending on self.pdf_or_png; redirect home otherwise."""
    context = self.get_context_data(**kwargs)
    if self.pdf_or_png == "pdf":
        return self.render_to_response(context)
    if self.pdf_or_png == "png":
        pdf = render_to_pdf("schedule.html", context=context,
                            request=request, **kwargs)
        # only the first rendered page is returned as the image
        first_page = convert_from_bytes(pdf)[0]
        response = HttpResponse(content_type='image/png')
        response[
            'Content-Disposition'] = 'attachment; filename=ITUscheduler'
        first_page.save(response, "PNG")
        return response
    return HttpResponseRedirect("/")
def ocr(self, pdf_bin, search_model):
    """OCR a PDF (Spanish language pack) and map configured regex rules
    onto model field values.

    For each stored rule whose re_model matches `search_model` (ilike, so
    '' matches everything), the rule's regex is searched against the full
    OCR text. A match sets res_data[re_model][re_field] to either a
    looked-up record id (when re_searchModel is set) or a literal /
    captured value. The raw OCR text is always stored under
    res_data[search_model]['description'].

    :param pdf_bin: raw PDF bytes
    :param search_model: model-name filter; falsy values become ''
    :return: res_data dict, or None when conversion/OCR fails
    """
    if not search_model:
        search_model = ""
    res_data = {}
    try:
        images = convert_from_bytes(pdf_bin)
    except:
        # deliberate best-effort: any conversion failure aborts OCR quietly
        _logger.warning('Failed to convert pdf')
        return None
    text = ''
    for im in images:
        try:
            text += pytesseract.image_to_string(im, lang='spa')
        except:
            # likewise: a single failing page aborts the whole run
            _logger.warning('Failed to OCR')
            return None
    for rex in self.search([('re_model', 'ilike', search_model)]):
        res = re.search(rex.re_exp, text,
                        re.MULTILINE | re.DOTALL | re.IGNORECASE)
        if res:
            if not rex.re_model in res_data:
                res_data[rex.re_model] = {}
            # first matching rule per field wins; later rules are ignored
            if rex.re_field not in res_data[rex.re_model]:
                if rex.re_searchModel:
                    # re_searchString is "field,operator,value": resolve a
                    # record id on the target model via a domain lookup
                    reA = rex.re_searchString.split(',')
                    rec = self.env[rex.re_searchModel].search([
                        (reA[0], reA[1], reA[2])
                    ])
                    if rec and len(rec) > 0:
                        res_data[rex.re_model][rex.re_field] = rec[0].id
                else:
                    if rex.re_value:
                        value = rex.re_value
                    else:
                        value = res.group(1)
                    if rex.re_field == 'unit_amount':
                        # source documents use comma decimal separators
                        value = float(value.replace(',', '.'))
                        if value < 2:
                            _logger.warning(
                                'value too small: %s. regex: %s'
                                % (value, rex.re_exp))
                    res_data[rex.re_model][rex.re_field] = value
    if search_model not in res_data:
        res_data[search_model] = {}
    res_data[search_model]['description'] = text
    return res_data
def pdf_to_png(self):
    """Render the balance-sheet PDF at 500 DPI to JPEG output.

    Local mode (self.gcp falsy): write every page beside self.out_path.
    GCP mode: read via self.fs, require a single page, and keep the JPEG
    in memory as self.bs_image.
    """
    if not self.gcp:
        pages = convert_from_path(self.in_path, 500)
        if len(pages) == 1:
            pages[0].save(self.out_path + ".jpg", "JPEG")
        else:
            # fix: enumerate yields (index, item) — the old code unpacked
            # them backwards and concatenated the int index into the path,
            # raising TypeError on any multi-page PDF
            for n, page in enumerate(pages):
                page.save(self.out_path + "_" + str(n) + ".jpg", "JPEG")
    elif self.gcp:
        with self.fs.open(self.in_path, 'rb') as f:
            pages = convert_from_bytes(f.read(), 500)
        if len(pages) == 1:
            byteio = BytesIO()
            pages[0].save(byteio, 'JPEG')
            self.bs_image = byteio
        else:
            print("Balance sheet must be one page only")
def test_conversion_from_bytes_using_dir_single_file(self):
    """single_file=True with output_file='test' must yield exactly one
    image named test.ppm inside the output folder."""
    start_time = time.time()
    with TemporaryDirectory() as path:
        with open("./tests/test.pdf", "rb") as pdf_file:
            pages = convert_from_bytes(
                pdf_file.read(),
                output_folder=path,
                output_file="test",
                single_file=True,
            )
        self.assertTrue(len(pages) == 1)
        self.assertTrue(pages[0].filename == os.path.join(path, "test.ppm"))
        for page in pages:
            page.close()
        print(
            "test_conversion_from_bytes_using_dir_single_file: {} sec".format(
                time.time() - start_time))
def convert(event, context):
    """Lambda-style handler: takes a dict with a base64 PDF under
    'pdf_file' and returns {'images': [<base64 jpeg>, ...]}, one per page.
    """
    pages = pdf2image.convert_from_bytes(
        base64.b64decode(event['pdf_file']),
        poppler_path='poppler_binaries/')
    encoded = []
    for page in pages:
        buf = io.BytesIO()
        page.save(buf, format='jpeg')
        encoded.append(base64.b64encode(buf.getvalue()).decode('ascii'))
    return {'images': encoded}
def convert_pdf(pdf_byte, user_id):
    """Save every page of a PDF as JPEG under temp/<user_id>/<uuid>/ and
    return the list of written file paths.

    :param pdf_byte: raw PDF bytes
    :param user_id: namespace for the output directory
    :return: list of saved JPEG paths, in page order
    """
    images = convert_from_bytes(pdf_byte, fmt="jpeg")
    print(len(images))
    current_uuid = uuid.uuid4()
    list_location = []
    for num, image in enumerate(images):
        image_name = 'temp/{}/{}/output-{}.jpg'.format(
            user_id, current_uuid, num)
        # fix: exist_ok=True is the race-safe equivalent of the old
        # exists() check + manual errno.EEXIST guard
        os.makedirs(os.path.dirname(image_name), exist_ok=True)
        image.save(image_name, format='JPEG')
        list_location.append(image_name)
    return list_location
def main(args):
    """Scan through PDF and split PDF and images.

    Walks every page of the input PDF, rasterises it, and looks for a QR
    code. Each QR code starts a new output PDF (with a separate one-page
    cover PDF + JPEG); pages without a code are appended to the current
    split. Decoded ids and page counts are accumulated into decoded.json,
    and the original file is removed on success.

    :param args: [filename, split_path, qr_prefix, qr_suffix]
    """
    filename = args[0]
    split_path = args[1]
    qr_prefix = args[2]
    qr_suffix = args[3]
    try:
        os.chdir(split_path)
        pdfPages = PdfFileReader(filename)
        pdf_writer = PdfFileWriter()
        # i: absolute page index; cover_index: page index of the current
        # split's cover; id_index: number of QR codes seen so far
        i = cover_index = id_index = 0
        page_count = 1
        prev_file = ''
        data = []
        output = {}
        for page_number in range(pdfPages.numPages):
            # convert pdf to series of images for scanning
            page = convert_from_bytes(
                open(filename, 'rb').read(),
                first_page=page_number+1,
                last_page=page_number+2)[0]
            # increase contrast of image for better QR decoding
            cv_img = numpy.array(page)
            mask = cv2.inRange(cv_img, (0, 0, 0), (200, 200, 200))
            inverted = 255 - cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
            # decode img - only look for QR codes
            val = pyzbar.decode(inverted, symbols=[ZBarSymbol.QRCODE])
            if val != []:
                # found a new qr code, split here
                # convert byte literal to string
                data = val[0][0].decode("utf-8")
                if data == "none":
                    # blank exam with 'none' qr code
                    data = "BLANK EXAM"
                else:
                    # strip the configured prefix/suffix from the decoded id
                    pre = data[0:len(qr_prefix)]
                    suf = data[(len(data)-len(qr_suffix)):len(data)]
                    if qr_prefix != '' and pre == qr_prefix:
                        data = data[len(qr_prefix):]
                    if qr_suffix != '' and suf == qr_suffix:
                        data = data[:-len(qr_suffix)]
                cover_index = i
                cover_filename = '{}_{}_cover.pdf'.format(filename[:-4], i)
                output_filename = '{}_{}.pdf'.format(filename[:-4], cover_index)
                output[output_filename] = {}
                output[output_filename]['id'] = data
                # save pdf: flush the previous split before starting a new one
                if i != 0 and prev_file != '':
                    output[prev_file]['page_count'] = page_count
                    with open(prev_file, 'wb') as out:
                        pdf_writer.write(out)
                    page.save('{}.jpg'.format(prev_file[:-4]),
                              "JPEG", quality=100)
                if id_index == 1:
                    # correct first pdf's page count and print file
                    output[prev_file]['page_count'] = page_count
                    with open(prev_file, 'wb') as out:
                        pdf_writer.write(out)
                    page.save('{}.jpg'.format(prev_file[:-4]),
                              "JPEG", quality=100)
                # start a new pdf and grab the cover
                cover_writer = PdfFileWriter()
                pdf_writer = PdfFileWriter()
                cover_writer.addPage(pdfPages.getPage(i))
                pdf_writer.addPage(pdfPages.getPage(i))
                # save cover
                with open(cover_filename, 'wb') as out:
                    cover_writer.write(out)
                # save cover image
                page.save('{}.jpg'.format(cover_filename[:-4]),
                          "JPEG", quality=100)
                id_index += 1
                page_count = 1
                prev_file = output_filename
            else:
                # add pages to current split_pdf
                page_count += 1
                pdf_writer.addPage(pdfPages.getPage(i))
            i += 1
        # save whatever is left
        output_filename = '{}_{}.pdf'.format(filename[:-4], cover_index)
        output[output_filename]['id'] = data
        output[output_filename]['page_count'] = page_count
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        if not os.path.exists('decoded.json'):
            # write json to file for parsing page counts and decoded ids later
            with open('decoded.json', 'w') as out:
                json.dump(output, out, sort_keys=True, indent=4)
        else:
            # merge this run's results into the existing decoded.json
            with open('decoded.json') as file:
                prev_data = json.load(file)
            prev_data.update(output)
            with open('decoded.json', 'w') as out:
                json.dump(prev_data, out)
        # remove original, unsplit file
        os.remove(filename)
    except Exception:
        print("\nbulk_qr_split.py: Failed when splitting pdf " + str(filename))
        traceback.print_exc()
        sys.exit(1)