def parse_state(state):
    """Parse one mobility-report PDF into a DataFrame of per-plot entries.

    ``state`` is treated as a US state when it matches one of the report
    names found under ``mobilityData/US``; in that case the report named
    with the module-level ``date`` is read.  Otherwise ``state`` is treated
    as a country and the report dated 2020-04-11 is read.

    Pages ``2 .. pageCount - 2`` are passed to ``parse_page`` and every
    entry it yields is tagged with the place name and the page index.

    Returns:
        A DataFrame whose first column is ``"state"`` (US reports) or
        ``"country"`` (all other reports), followed by ``county``,
        ``category``, ``change``, ``changecalc``, ``dates``, ``values`` and
        ``page``.  When no entries were parsed an empty DataFrame is
        returned instead of failing on the column selection.
    """
    # NOTE(review): names are globbed from "mobilityData/US" but the PDF is
    # opened from "mobilityDataPDF/US" -- confirm both directories are meant.
    us_states = {
        path.split("_US_", 1)[1].split("_Mobility", 1)[0]
        for path in glob.glob("mobilityData/US/*.pdf")
    }
    if state in us_states:
        label = "state"
        pdf_path = f"mobilityDataPDF/US/{date}_US_{state}_Mobility_Report_en.pdf"
    else:
        label = "country"
        pdf_path = f"mobilityDataPDF/2020-04-11_{state}_Mobility_Report_en.pdf"

    data = []
    # The context manager closes the PDF even if parse_page() raises.
    with fitz.Document(pdf_path) as document:
        # For documents with fewer than 4 pages this range is empty.
        for i in range(2, document.pageCount - 1):
            for entry in parse_page(document, i):
                entry[label] = state
                entry["page"] = i
                data.append(entry)

    df = pd.DataFrame(data)
    if df.empty:
        # Selecting columns on an empty frame would raise KeyError.
        return df
    return df[[
        label, "county", "category", "change", "changecalc", "dates",
        "values", "page"
    ]]
def merge_pdfs(outfile, pdf_files):
    """Concatenate several PDF files into a single PDF.

    :param outfile: Path of the merged PDF to write
    :param pdf_files: PDF files to merge, appended in the given order
    """
    merged = fitz.Document()

    print("reading input pdf files...", flush=True)
    for source_path in pdf_files:
        with fitz.Document(source_path) as source_doc:
            merged.insertPDF(source_doc)
    print("pdf files have been read")

    # dirname() gives "" for a bare file name; normpath() turns that into
    # "." so makedirs() does not fail (https://bugs.python.org/issue33968).
    target_dir = os.path.normpath(os.path.dirname(outfile))
    os.makedirs(target_dir, exist_ok=True)

    print(f"saving the merged pdf document into: {outfile}")
    print(f" ({os.path.abspath(outfile)})")
    merged.save(outfile)
def process(pdf_dir, out_dir):
    """Merge the PDFs found in ``pdf_dir`` into one new PDF inside ``out_dir``.

    The input list comes from ``files_in_folder(pdf_dir, filter_extend=['pdf'])``
    and the files are appended in the order that helper returns them.  The
    output file name contains a fresh ``uuid.uuid1()`` value, so repeated
    calls do not overwrite each other.

    :param pdf_dir: folder that is searched for PDF files
    :param out_dir: folder the merged PDF is written to (created if missing)
    """
    files_path = files_in_folder(pdf_dir, filter_extend=['pdf'])

    # Create the destination up front so save() cannot fail on a missing
    # folder; normpath() maps an empty path to "." which makedirs() accepts.
    os.makedirs(os.path.normpath(out_dir), exist_ok=True)

    # Context managers close both the merged and the source documents even
    # when an insert or the final save raises.
    with fitz.Document() as out_pdf:
        for file_path in files_path:
            Message.info(f'开始提取:{file_path}')
            with fitz.Document(file_path) as now_pdf:
                out_pdf.insert_pdf(now_pdf)
        out_pdf_path = os.path.join(out_dir, f"PDF合并文件-{uuid.uuid1()}.pdf")
        out_pdf.save(out_pdf_path)
def image_pdf(file_dir):
    """Combine the files in ``file_dir`` into one PDF, one page per image.

    Every directory entry is opened with ``fitz.Document``, converted to a
    single-image PDF and appended to the result.  Entries are processed in
    sorted name order so the page order is deterministic.  The result is
    saved as ``<dir_name>/<base_name>.pdf`` where both parts come from
    ``get_dir_name(file_dir)``.

    :param file_dir: folder containing the images to combine
    """
    dir_name, base_name = get_dir_name(file_dir)
    with fitz.Document() as doc:
        # os.listdir() returns entries in arbitrary order; sort them.
        for name in sorted(os.listdir(file_dir)):
            img_path = os.path.join(file_dir, name)
            # Open the image and turn it into an in-memory PDF.
            with fitz.Document(img_path) as img_doc:
                pdf_bytes = img_doc.convertToPDF()
            # Wrap the bytes as a one-page PDF and append it to the result.
            with fitz.Document("pdf", pdf_bytes) as img_pdf:
                doc.insertPDF(img_pdf)
        # Save the combined document.
        doc.save(dir_name + os.sep + base_name + ".pdf")
def image_pdf(file_dir):
    """Combine the files in ``file_dir`` into one PDF, one page per image.

    Files are processed oldest-first (sorted by modification time).  Each
    one is opened with ``fitz.Document``, converted to a single-image PDF
    and appended to the result, which is saved as
    ``<dir_name>/<base_name>.pdf`` (both parts come from
    ``get_dir_name(file_dir)``).  A message box reports success afterwards.

    :param file_dir: folder containing the images to combine
    """
    dir_name, base_name = get_dir_name(file_dir)
    # os.path.join keeps the pattern valid on every platform; the previous
    # hard-coded backslash only worked on Windows.
    pattern = os.path.join(file_dir, '*')
    with fitz.Document() as doc:
        for img in sorted(glob.glob(pattern), key=os.path.getmtime):
            # Open the image and turn it into an in-memory PDF.
            with fitz.Document(img) as img_doc:
                pdf_bytes = img_doc.convertToPDF()
            # Wrap the bytes as a one-page PDF and append it to the result.
            with fitz.Document("pdf", pdf_bytes) as img_pdf:
                doc.insertPDF(img_pdf)
        # Save the combined document.
        doc.save(dir_name + os.sep + base_name + ".pdf")
    messagebox.showinfo('提示', '转换成功!')
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """Check that the table of contents is identical before and after OCR."""
    input_file = resources / 'toc.pdf'
    toc_before = fitz.Document(str(input_file)).getToC()

    ocr_args = (ocr_option, '--output-type', output_type)
    check_ocrmypdf(input_file, outpdf, *ocr_args, env=spoof_tesseract_noop)

    toc_after = fitz.Document(str(outpdf)).getToC()

    # Printed so a failing run shows both outlines.
    print(toc_before)
    print(toc_after)
    assert toc_before == toc_after
def check_docs_keywords(folder_name):
    """Download a folder's documents and return the first keyword group found.

    After ``download_docs`` has run, every file matching
    ``/LOTS/<folder_name>/*.*`` is scanned according to its extension:

    * ``.pdf``  -- text of each page, page by page
    * ``.docx`` -- text returned by ``docx2txt.process``
    * ``.doc``  -- the file is copied line by line to ``<file>.txt`` and
      that copy is read as cp1251 text

    Files with any other extension are skipped.

    :param folder_name: name of the sub-folder under ``/LOTS``
    :return: the key of ``keywords_files`` whose keyword list has an entry
        occurring in one of the documents, or ``None`` when nothing matches
    """
    download_docs(folder_name, create_list_links(folder_name))
    list_files = glob.glob(f"/LOTS/{folder_name}/*.*")

    def find_keywords(text):
        """Return the first ``keywords_files`` key with a keyword in ``text``."""
        for keywords_key, keywords_list in keywords_files.items():
            for keywords in keywords_list:
                if keywords in text:
                    return keywords_key
        return None

    for file in list_files:
        # Decide by extension: a substring test on the whole path would also
        # match folder names and would treat every ".docx" as ".doc".
        lowered = file.lower()
        if lowered.endswith(".pdf"):
            with fitz.Document(file) as pdf:
                for i_page in range(pdf.pageCount):
                    page_text = pdf.loadPage(i_page).getText("text")
                    found_keywords = find_keywords(page_text)
                    if found_keywords:
                        return found_keywords
        elif lowered.endswith(".docx"):
            found_keywords = find_keywords(docx2txt.process(file))
            # Keep scanning the remaining files when this one has no match.
            if found_keywords:
                return found_keywords
        elif lowered.endswith(".doc"):
            # NOTE(review): the .doc file is read as text with the default
            # encoding and re-read as cp1251 -- confirm this round trip
            # (and the "<file>.txt" side file) is still wanted.
            with open(file) as file_in:
                with open(f"{file}.txt", "w") as file_out:
                    for line in file_in:
                        file_out.write(line)
            with open(f"{file}.txt", encoding="cp1251") as txt_text:
                doc_text = txt_text.read()
            found_keywords = find_keywords(doc_text)
            if found_keywords:
                return found_keywords
    return None
def _conver_img(pdf_path, pdf_save_path, pdf_name):
    """Render every page of a PDF to an image file at 2x zoom.

    :param pdf_path: prefix that is concatenated (as a plain string) in
        front of each output file name
    :param pdf_save_path: path of the PDF that is opened and rendered
    :param pdf_name: PDF file name; the text before its first "." becomes
        the base name of the images
    :return: list of the image file names (without the ``pdf_path``
        prefix) in page order, named ``<base>-0001.jpg``, ``<base>-0002.jpg``...
    """
    pdf_name_without_ext = pdf_name.split(".")[0]
    # A zoom factor of 2 in each dimension yields four times the pixels.
    # The matrix is the same for every page, so build it once.
    trans = fitz.Matrix(2.0, 2.0).preRotate(0)

    jpg_dir = []
    with fitz.Document(pdf_save_path) as doc:
        for pg in range(doc.pageCount):
            jpg_name = '{0}-{1}.jpg'.format(pdf_name_without_ext,
                                            "%04d" % (pg + 1))
            pm = doc[pg].getPixmap(matrix=trans, alpha=False)
            # NOTE(review): writePNG() is used although the name ends in
            # ".jpg"; kept so existing output stays unchanged.  The former
            # Windows / non-Windows branches were identical and are merged.
            pm.writePNG(pdf_path + jpg_name)
            jpg_dir.append(jpg_name)
    return jpg_dir
def extract_all_lines_slides(filename):
    """Collect slide lines for a fixed set of financial metrics.

    Each metric has one or more search keywords.  For every keyword
    ``extract_line_slides(doc, keyword)`` is called and all non-``None``
    results of a metric are merged into one dict (later keywords overwrite
    earlier ones on key clashes).

    :param filename: path of the document to open with ``fitz.Document``
    :return: dict mapping each metric name to its merged result dict
    """
    keyword_groups = {
        "Net property income": ["Net property income"],
        "Distribution per unit": ["Distribution per unit", "DPU"],
        "Total assets": ["Total assets"],
        "Total liabilities": ["Total liabilities"],
        "Total debts": ["Total debts"],
        "Units": ["Units in issue"],
        "Net asset value": ["Net asset value", "NAV"],
        "Gearing": ["Aggregate Leverage", "Gearing"],
        "Cost of debt": ["Cost of debt"],
        "Interest cover": ["Interest cover"],
        "Average term to maturity": ["Average term to maturity"],
        "WALE": ["WALE", "Weighted average"],
    }
    doc = fitz.Document(filename)

    def collect(keywords):
        """Merge the extraction results of all keywords of one metric."""
        merged = dict()
        for keyword in keywords:
            found = extract_line_slides(doc, keyword)
            if found is not None:
                merged.update(found)
        return merged

    return {
        metric: collect(keywords)
        for metric, keywords in keyword_groups.items()
    }
def make_pdf(input_folder, output_path, fname, quiet):
    """Render every SVG in ``input_folder`` as one page of a combined PDF.

    Files are processed in sorted name order so the page order is
    deterministic.  Entries that do not end in ``.svg`` (case-insensitive)
    are skipped and reported.

    :param input_folder: folder that is scanned for SVG files
    :param output_path: path the combined PDF is saved to
    :param fname: name shown in the success message
    :param quiet: when true, the success and ignored-file messages are
        suppressed
    :return: ``0`` on success, ``1`` when the folder has no SVGs or saving
        failed (a message is written to stderr in both cases)
    """
    non_svgs = []
    n = 0
    with fitz.Document() as output:
        # os.listdir() order is arbitrary; sort for a stable page order.
        for file_path in sorted(os.listdir(input_folder)):
            if not file_path.lower().endswith('.svg'):
                non_svgs.append(file_path)
                continue
            n += 1
            im = svg2rlg(os.path.join(input_folder, file_path))
            b = renderPDF.drawToString(im)  # convert to pdf
            with fitz.open('pdf', b) as img_pdf:  # open as pdf
                output.insertPDF(img_pdf)

        if not n:
            print('Error - no SVGs in input folder\n', file=sys.stderr)
            return 1

        # Only the save itself is guarded; Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit through.
        try:
            output.save(output_path)
        except Exception as exc:
            print('Error - something went wrong while saving the file',
                  file=sys.stderr)
            print('\t' + repr(exc), file=sys.stderr)
            return 1

    if not quiet:
        print("Successfully rendered " + str(n) + " SVGs to " + fname)
        if non_svgs:
            print("Ignored " + str(len(non_svgs)) + " non-svg files:")
            for line in non_svgs:
                print('\t' + line)
    return 0
def main():
    """Command-line entry point: write a table of contents into a PDF.

    Positional arguments are the input PDF, a readable TOC file and the
    output PDF; ``--offset`` (int, default 0) is forwarded to ``parse_toc``
    together with the opened TOC file.  The parsed TOC is set on the input
    document, which is then saved to the output path.
    """
    import argparse
    import sys

    cli = argparse.ArgumentParser(
        description='Adds "table of contents" to pdf files.')
    cli.add_argument(
        "--offset",
        type=int,
        default=0,
        help="site offset when the first chapter starts.",
    )
    cli.add_argument("input")
    cli.add_argument("toc", type=argparse.FileType("r"))
    cli.add_argument("output")
    options = cli.parse_args()

    outline = parse_toc(options.toc, options.offset)

    # Apply the outline and write the result.
    document = fitz.Document(options.input)
    chapter_count = document.setToC(outline)
    document.save(options.output)

    print("Done setting {} chapters".format(chapter_count))
def on_treeWidget_imagenamelist_itemDoubleClicked(self, qtreeitem, p_int):
    """Show the image or PDF belonging to the double-clicked tree item.

    The record id is read from column 1 of the item and looked up through
    ``self.IC.get_data``; nothing happens when no record is returned.
    PDFs are opened with ``fitz`` and the page navigation widgets are
    shown; any other type is loaded as a ``QPixmap`` into
    ``self.label_image`` with the navigation widgets hidden.  Finally the
    zoom slider is set to 30.
    """
    img_id = int(qtreeitem.text(1))
    res = self.IC.get_data(1, False, *VALUE_TUPLE_IM, **{'autoid': img_id})
    if len(res) == 0:
        return

    ext = res[0]['ext']
    image = res[0]['img']
    is_pdf = ext.lower() == 'pdf'

    # The page navigation controls are only useful for PDFs.
    self.comboBox_jumpto.setVisible(is_pdf)
    self.pushButton_prepage.setVisible(is_pdf)
    self.pushButton_nextpage.setVisible(is_pdf)

    if is_pdf:
        self.current_img = fitz.Document(stream=image, filetype='pdf')
        page_count = self.current_img.pageCount
        self.comboBox_jumpto.clear()
        self.comboBox_jumpto.addItems(
            ['第' + str(number) + '页' for number in range(1, page_count + 1)])
        self.current_page = self.current_img.loadPage(0)
    else:
        self.current_img = QPixmap.fromImage(QImage.fromData(image))
        self.label_image.setPixmap(self.current_img)

    # Default zoom; per the original note this also triggers
    # on_horizontalSlider_zoom_valueChanged.
    self.horizontalSlider_zoom.setValue(30)
def make_page(self, page:fitz.Page, debug=True):
    ''' Parse one pdf page and create the matching docx page.

        With ``debug=True`` a new fitz document is used to plot the layout
        while parsing; it is saved next to the source pdf as
        ``debug_<pdf name>`` and the layout is serialized to ``layout.json``
        in the same folder.  Returns ``self``.
    '''
    # debug outputs live beside the source pdf: demo.pdf -> debug_demo.pdf
    path, filename = os.path.split(self.filename_pdf)
    layout_json = os.path.join(path, 'layout.json')
    debug_doc = fitz.Document() if debug else None
    debug_pdf = os.path.join(path, f'debug_{filename}')
    debug_kwargs = dict(debug=debug, doc=debug_doc, filename=debug_pdf)

    # init page layout, plotting the raw extraction in debug mode
    self.initialize(page)
    if debug:
        self._layout.plot(debug_doc, 'Source Text Blocks')
        self._paths.plot(debug_doc, 'Source Shapes',
                         self._layout.width, self._layout.height)

    # parse the layout, build the docx page and save
    parsed_layout = self.layout.parse(**debug_kwargs)
    parsed_layout.make_page(self.doc_docx)
    self.save()

    # debug files: layout plot as pdf, layout data as json
    if debug:
        debug_doc.save(debug_pdf)
        self.layout.serialize(layout_json)

    return self
def debug_page(self, i:int, docx_filename:str=None, debug_pdf=None, layout_file=None, config:dict=None):
    ''' Convert a single page with debug output enabled.
        ---
        Args:
            - i (int): page index to convert
            - docx_filename (str): DOCX filename to write to
            - debug_pdf (str): pdf file receiving the layout plots
              (source name prefixed with "debug_" by default)
            - layout_file (str): json file receiving the parsed layout data
              (layout.json beside the source pdf by default)
            - config (dict): extra settings; the debug entries are added to
              this same dict before it is passed to ``convert``
    '''
    if not config:
        config = {}

    # default debug outputs sit next to the source: demo.pdf -> debug_demo.pdf
    path, filename = os.path.split(self.filename_pdf)
    debug_pdf = debug_pdf or os.path.join(path, f'debug_{filename}')
    layout_file = layout_file or os.path.join(path, 'layout.json')

    # fitz document that receives the layout plots
    debug_doc = fitz.Document()
    config['debug'] = True
    config['debug_doc'] = debug_doc
    config['debug_filename'] = debug_pdf

    # parse and create docx, then dump the layout for inspection
    self.convert(docx_filename, pages=[i], config=config)
    self.serialize(layout_file)
def get_table_area(pdf_data):
    """Return the bounding box of the Race/Ethnicity table on page 3.

    The box spans from the top-left corner of the word "White" to the
    right edge of the page and down to the bottom of the word "Total" that
    sits below "White" in the same column (same rounded x0, larger rounded
    y0).  When a word occurs several times the last occurrence wins.

    Args:
        pdf_data: PDF content passed to ``fitz.Document`` as ``stream``.

    Returns:
        A ``fitz.Rect`` covering the table.

    Raises:
        ValueError: if "White" or a matching "Total" is not found on page 3
            (previously this surfaced as an AttributeError on ``None``).
    """
    with fitz.Document(stream=pdf_data, filetype='pdf') as doc:
        page3 = doc[2]  # page indexes start at 0
        # Extract the word list once and reuse it for both searches.
        words = page3.getText('words')
        page_right = page3.bound().x1

    white_bbox = None
    for (x0, y0, x1, y1, word, block_no, line_no, word_no) in words:
        if word == 'White':
            white_bbox = fitz.Rect(x0, y0, x1, y1)
    if white_bbox is None:
        raise ValueError('word "White" not found on page 3')

    total_bbox = None
    for (x0, y0, x1, y1, word, block_no, line_no, word_no) in words:
        if word != 'Total':
            continue
        same_column = round(x0) == round(white_bbox.x0)
        below_white = round(y0) > round(white_bbox.y0)
        if same_column and below_white:
            total_bbox = fitz.Rect(x0, y0, x1, y1)
    if total_bbox is None:
        raise ValueError('no "Total" found below "White" on page 3')

    return fitz.Rect(white_bbox.x0, white_bbox.y0, page_right, total_bbox.y1)
def _decode_page(page_data):
    """
    Render the first page of the given PDF to a PNG and scan it for QR codes.

    The page is rendered at 5x zoom and written to ``page0.png`` in the
    current working directory (overwritten on every call), then handed to
    ``zbar_wrapper.scan_qrcode``.

    :param bytes page_data: Data of the PDF single page
    :returns: the value returned by ``zbar_wrapper.scan_qrcode`` and the
        path of the rendered PNG
    :rtype: tuple
    """
    tic = time()
    # The context manager closes the document even if rendering or
    # scanning raises.
    with fitz.Document("pdf", page_data) as doc:
        page = next(doc.pages())  # first page
        mat = fitz.Matrix(5.0, 5.0)  # zoom factor in each dimension
        pix = page.get_pixmap(matrix=mat, alpha=0)
        img_url = os.path.join(os.getcwd(), "page0.png")
        pix.save(img_url)  # store image as a PNG
        qr_data = zbar_wrapper.scan_qrcode(img_url, page)
        _logger.debug(f"\t\tQRCode decoded using ZBar in {time() - tic:.3} sec")
    return qr_data, img_url
def __init__(self, pdf_file, debug=False, text_gray=218):
    """Open ``pdf_file`` with fitz and remember the walker settings.

    :param pdf_file: path of the PDF to open
    :param debug: stored as ``self.DEBUG``
    :param text_gray: stored as ``self.TEXT_GRAY``
    """
    super(Walker, self).__init__()
    self.pdf_file = pdf_file
    document = fitz.Document(pdf_file)
    self.pdf = document
    self.page_count = len(document)
    self.DEBUG = debug
    self.TEXT_GRAY = text_gray
def debug_page(self, page:fitz.Page):
    ''' Parse one page in debug mode and create the docx page.

        The layout is plotted into a new fitz document which, if it ends up
        with at least one page, is saved beside the source pdf as
        ``debug_<pdf name>``.  The layout is also serialized to
        ``layout.json`` in the same folder.  Returns ``self``.
    '''
    # debug outputs live beside the source pdf: demo.pdf -> debug_demo.pdf
    path, filename = os.path.split(self.filename_pdf)
    layout_json = os.path.join(path, 'layout.json')
    debug_doc = fitz.Document()
    debug_pdf = os.path.join(path, f'debug_{filename}')
    debug_kwargs = dict(debug=True, doc=debug_doc, filename=debug_pdf)

    # init page layout and plot the raw extraction
    self.initialize(page)
    self._layout.plot(**debug_kwargs)
    self._paths_extractor.paths.plot(
        debug_doc, 'Source Paths', self._layout.width, self._layout.height)

    # parse, then write the debug files
    self.layout.parse(**debug_kwargs)
    if len(debug_doc):
        debug_doc.save(debug_pdf)       # layout plotting
    self.layout.serialize(layout_json)  # layout information

    # make docx page
    self._layout.make_page(self.doc_docx)
    self.save()

    return self
def __parse_pdf(self, path, result_dir, **kwargs):
    """Render the requested PDF pages to PNG data and parse each as an image.

    :param path: path of the PDF to open
    :param result_dir: forwarded to the image parser
    :param kwargs: must contain ``range``, an object whose ``pages``
        attribute lists the 1-based page numbers to process

    A page that fails to render or parse is logged and skipped so the
    remaining pages are still processed; progress advances either way.
    """
    import logging

    pages = kwargs['range'].pages
    # Render 'scale' times larger than page.bound(); same for every page.
    scale = 1.25
    scale_matrix = fitz.Matrix(scale, scale)

    doc = fitz.Document(path)
    try:
        p_layer = self.progress.add_layer((0, len(pages)))
        for page_number in pages:
            page = doc.load_page(page_number - 1)
            self.send_update('Rendering {}-th page of PDF'.format(
                str(page.number + 1)))
            try:
                png = page.get_pixmap(matrix=scale_matrix).getPNGData()
                png = np.frombuffer(png, dtype=np.int8)
                self.__parse_img(
                    png,
                    result_dir,
                    file_prefix='page-{}-'.format(str(page.number + 1)),
                    board_title_fmt="Страница {}, доска {{}} из {{}}".format(
                        page.number + 1))
            except Exception:
                # Best effort per page, but leave a trace instead of
                # silently discarding the error.  KeyboardInterrupt is not
                # an Exception and therefore still propagates.
                logging.getLogger(__name__).exception(
                    "Failed to process page %s of %s", page.number + 1, path)
            self.progress.append_progress(p_layer, 1)
        self.progress.pop_layer(p_layer)
    finally:
        doc.close()
def PdfFileRead(self):
    """Return the text of the PDF provided by ``self.get_contents()``.

    PyMuPDF (``fitz``) is tried first.  If importing or using it raises
    any exception, the text is extracted with PyPDF2 instead, as a
    workaround for environments where MuPDF is not usable.  The text of
    all pages is concatenated without a separator; no whitespace
    filtering is applied here.
    """
    contents = self.get_contents()
    try:
        import fitz
        with fitz.Document(stream=contents, filetype="pdf") as pdf_file:
            text = "".join(page.get_text("text") for page in pdf_file)
    except Exception:
        import io
        import PyPDF2
        # fitz accepts raw bytes as ``stream`` but PdfFileReader needs a
        # seekable file-like object, so wrap bytes before handing them over.
        if isinstance(contents, (bytes, bytearray)):
            stream = io.BytesIO(contents)
        else:
            stream = contents
        pdf_reader = PyPDF2.PdfFileReader(stream)
        text = "".join(page.extractText() for page in pdf_reader.pages)
    return text
def debug_page(self, i: int, docx_filename: str = None, config: dict = None):
    ''' Convert a single page with debug output enabled.

        A new fitz document for the layout plots and the debug pdf name
        (``debug_<pdf name>`` beside the source) are added to ``config``
        before ``make_docx`` is called for page ``i``.  The first returned
        layout is serialized to ``layout.json`` and returned.
    '''
    if not config:
        config = {}

    # debug outputs live beside the source pdf: demo.pdf -> debug_demo.pdf
    path, filename = os.path.split(self.filename_pdf)
    layout_json = os.path.join(path, 'layout.json')
    debug_doc = fitz.Document()
    config.update(
        debug=True,
        doc=debug_doc,
        filename=os.path.join(path, f'debug_{filename}'),
    )

    # parse and make page
    first_layout = self.make_docx(docx_filename, pages=[i], config=config)[0]

    # layout information for debugging
    first_layout.serialize(layout_json)

    return first_layout
def __init__(self, pdf_file: str):
    ''' Remember the pdf path and open it as a fitz document.'''
    # path of the source pdf
    self.filename_pdf = pdf_file

    # fitz object used to read the pdf
    document = fitz.Document(pdf_file)
    self._doc_pdf = document
def add_pages(self):
    """Download ``self.file`` and add one image label per PDF page.

    A message label is added instead when ``self.file`` is empty or when
    downloading / opening the document fails (including HTTP error
    responses).  Each page is rendered at 1.5x zoom into a ``QLabel`` that
    is appended to ``self.page_container``'s layout.
    """
    # Nothing to request.
    if not self.file:
        message_label = QLabel('没有文件.')
        self.page_container.layout().addWidget(message_label)
        return
    try:
        response = requests.get(self.file)
        # Fail here on 4xx/5xx instead of feeding an error page to fitz.
        response.raise_for_status()
        doc = fitz.Document(filename='a', stream=response.content)
    except Exception as e:
        message_label = QLabel('获取文件内容失败.\n{}'.format(e))
        self.page_container.layout().addWidget(message_label)
        return

    zoom_matrix = fitz.Matrix(1.5, 1.5)  # zoom ratio, same for all pages
    image_format = QImage.Format_RGB888
    try:
        for page_index in range(doc.pageCount):
            page = doc.loadPage(page_index)
            page_label = QLabel()
            # Render the page and wrap the raw samples in a QImage.
            page_pixmap = page.getPixmap(matrix=zoom_matrix, alpha=False)
            page_image = QImage(
                page_pixmap.samples, page_pixmap.width, page_pixmap.height,
                page_pixmap.stride, image_format)
            page_map = QPixmap()
            page_map.convertFromImage(page_image)
            page_label.setPixmap(page_map)
            page_label.setScaledContents(True)  # pixmap resizes with label
            self.page_container.layout().addWidget(page_label)
    finally:
        doc.close()
def train_from_pdf(bot: Bot, update: Update, conn):
    """Import trains from a PDF sent in a chat message and confirm to the user.

    The attached document is downloaded, opened with fitz and passed to
    ``pdf_extraction.extract_info_from_pdf``.  Every extracted train is
    inserted through ``db_utils.insert_train_in_db`` and described in the
    reply; the connection is committed once all trains were inserted.

    ``TrainInPastError`` and any other exception are logged and answered
    with a sticker plus an error text instead of being raised.
    """
    try:
        raw_pdf = update.message.document.get_file().download_as_bytearray()
        pdf_document = fitz.Document(stream=raw_pdf, filetype="pdf")
        chat_id = update.effective_chat.id
        trains = pdf_extraction.extract_info_from_pdf(pdf_document, chat_id)
        if not trains:
            raise Exception(f"Train extraction list was empty :( {trains}")

        message = f"{_(app_strings.added_train)}"
        for train in trains:
            db_utils.insert_train_in_db(train, conn, False)
            depart_text = format_date(
                train.depart_date, check_daily=False, check_interval="")
            message += get_train_info_message(train, depart_text, conn)
        conn.commit()

        bot.send_sticker(update.message.from_user.id, stickers.drake_approving)
        update.message.reply_text(message)
    except TrainInPastError as e:
        logging.error(e)
        bot.send_sticker(update.message.from_user.id, stickers.tom_puzzled)
        update.message.reply_text(_(app_strings.train_in_past_error))
    except Exception as e:
        logging.error(e)
        bot.send_sticker(update.effective_chat.id, stickers.blackman_crying)
        bot.send_message(update.effective_chat.id, _(app_strings.error_pdf))
def debug_page(self, i: int, docx_filename: str = None, debug_pdf: str = None, layout_file: str = None, kwargs: dict = None):
    '''Convert a single page with debug output enabled.

    Args:
        i (int): Page index to convert.
        docx_filename (str): docx filename to write to.
        debug_pdf (str): pdf file receiving the layout plots. Defaults to
            the source name prefixed with ``debug_``.
        layout_file (str): json file receiving the parsed layout data.
            Defaults to ``layout.json`` beside the source pdf.
        kwargs (dict): Extra settings; the debug entries are added to this
            same dict before it is passed to ``convert``.
    '''
    if not kwargs:
        kwargs = {}

    # default debug outputs sit next to the source: demo.pdf -> debug_demo.pdf
    path, filename = os.path.split(self.filename_pdf)
    debug_pdf = debug_pdf or os.path.join(path, f'debug_{filename}')
    layout_file = layout_file or os.path.join(path, 'layout.json')

    # fitz document that receives the layout plots
    debug_doc = fitz.Document()
    kwargs['debug'] = True
    kwargs['debug_doc'] = debug_doc
    kwargs['debug_filename'] = debug_pdf

    # parse and create docx, then dump the layout for inspection
    self.convert(docx_filename, pages=[i], kwargs=kwargs)
    self.serialize(layout_file)
def on_treeWidget_imagenamelist_itemDoubleClicked(self, qtreeitem, p_int):
    """Show the image or PDF belonging to the double-clicked tree item.

    Returns immediately when ``self.power[1]`` is ``'0'``.  Otherwise the
    id in column 0 of the item is matched against ``self.images_list``;
    for the first match a PDF is opened with ``fitz`` (page navigation
    widgets shown) and anything else is loaded as a ``QPixmap`` into
    ``self.label_image`` (navigation widgets hidden).  Finally the zoom
    slider is set to 30.
    """
    if self.power[1] == '0':
        return

    rela_id = int(qtreeitem.text(0))
    matched = next(
        (entry for entry in self.images_list if entry.autoid == rela_id),
        None,
    )
    if matched is not None:
        is_pdf = matched.imgid.ext.lower() == 'pdf'

        # The page navigation controls are only useful for PDFs.
        self.comboBox_jumpto.setVisible(is_pdf)
        self.pushButton_prepage.setVisible(is_pdf)
        self.pushButton_nextpage.setVisible(is_pdf)

        if is_pdf:
            self.current_img = fitz.Document(stream=matched.imgid.img,
                                             filetype='pdf')
            page_count = self.current_img.pageCount
            self.comboBox_jumpto.clear()
            self.comboBox_jumpto.addItems(
                ['第' + str(number) + '页'
                 for number in range(1, page_count + 1)])
            self.current_page = self.current_img.loadPage(0)
        else:
            self.current_img = QPixmap.fromImage(
                QImage.fromData(matched.imgid.img))
            self.label_image.setPixmap(self.current_img)

    # Default zoom; per the original note this also triggers
    # on_horizontalSlider_zoom_valueChanged.
    self.horizontalSlider_zoom.setValue(30)
def pdf_image(pdf_name, Gray=False):
    """Render every page of a PDF to an image file at 3x zoom.

    Images are written next to the PDF as ``<name>_<page>.jpg`` with
    1-based page numbers, where ``<name>`` is ``pdf_name`` without its
    extension.

    :param pdf_name: path of the PDF to render
    :param Gray: when true, each written image is reopened with PIL,
        converted to mode ``'L'`` and saved over the same path
    :return: list of the written image paths in page order
    """
    import os

    # splitext() handles any extension length; slicing off four characters
    # only worked for names ending in a 3-letter extension.
    base = os.path.splitext(pdf_name)[0]
    # The matrix is the same for every page, so build it once.
    trans = fitz.Matrix(3.0, 3.0).preRotate(0)

    img_paths = []
    with fitz.Document(pdf_name) as pdf:
        for pg in range(pdf.pageCount):
            pm = pdf[pg].getPixmap(matrix=trans, alpha=False)
            img_path = base + '_' + str(pg + 1) + '.jpg'
            # NOTE(review): writePNG() is used although the name ends in
            # ".jpg"; kept so existing output stays unchanged.
            pm.writePNG(img_path)
            img_paths.append(img_path)
            if Gray:
                # Close the source handle before overwriting the file.
                with Image.open(img_path) as img:
                    low = img.convert('L')
                low.save(img_path)
    return img_paths
def pdf_format_2(input_file, page_no):
    """Extract the 'DECLARACIÓN NUTRIMENTAL' entries from one PDF page.

    The page's text blocks are scanned for a block that contains both the
    marker text and a ``;``.  That block is split on ``;``; in every part
    the text up to the first ``[`` and the marker itself are removed, and
    each ``]`` acts as a line break.  If several blocks match, the last
    one wins.

    :param input_file: path of the PDF to open
    :param page_no: 1-based page number (anything ``int()`` accepts)
    :return: list of stripped entries (empty when no block matches), or
        ``None`` after printing a message when the page cannot be selected
    """
    marker = 'DECLARACIÓN NUTRIMENTAL'
    with fitz.Document(input_file) as doc:
        # Only the page lookup is guarded, so unrelated errors are no
        # longer reported as a missing page.
        try:
            page = doc[int(page_no) - 1]
        except (IndexError, ValueError, TypeError):
            print("page Num doesn't exist")
            return None
        contents = page.get_text("blocks")

    outer_list = []
    for content in contents:
        block_text = content[4]
        if marker not in block_text or ';' not in block_text:
            continue
        outer_list = []
        for each_content in block_text.split(';'):
            # Drop everything up to the first "[" and the marker text.
            cleaned = re.sub(r'^.*?\[', '', each_content).replace(marker, '')
            if ']' in cleaned:
                outer_list.extend(
                    cleaned.replace(']', '\n').strip().split('\n'))
            else:
                outer_list.append(cleaned)
    return [i.strip() for i in outer_list if i != '']
def parse_place(place, pdf_path, args):
    """Parse one place's report PDF into a DataFrame of plot entries.

    Entries from ``parse_front_pages`` are tagged with page 1 and county
    "Overall"; entries from ``parse_page`` for pages ``2 .. pageCount - 2``
    are tagged with their page index.  Every entry gets ``place`` as its
    ``state``.  ``args`` is accepted but not used here.

    Returns the DataFrame restricted to the columns state, county,
    category, change, changecalc, dates, values and page, or the empty
    DataFrame as-is when nothing was parsed.
    """
    doc = fitz.Document(pdf_path)
    data = []

    # Aggregate plots on the front pages.
    for entry in parse_front_pages(doc):
        entry.update(state=place, page=1, county="Overall")
        data.append(entry)

    # Per-county plots on the remaining pages.
    for page_index in range(2, doc.pageCount - 1):
        for entry in parse_page(doc, page_index):
            entry.update(state=place, page=page_index)
            data.append(entry)

    df = pd.DataFrame(data)
    if not len(df):
        return df

    ncounties = df['county'].nunique()
    print(f"Parsed {len(df)} plots for {ncounties} counties in {place}")

    columns = [
        "state", "county", "category", "change", "changecalc", "dates",
        "values", "page"
    ]
    return df[columns]
def main(input: str, output: str, dpi: int, first_page: Optional[int],
         last_page: Optional[int], ocr: bool, clean: bool):
    """Clean a range of pages and write them as one PDF or as JPG files.

    ``input`` is either a ``.pdf`` path (pages are taken from it) or a glob
    pattern whose sorted matches are treated as pages.  ``first_page`` is
    1-based and ``last_page`` inclusive; ``None`` means from the start /
    to the end.  Pages are processed by ``clean_single_page`` in a process
    pool.

    If ``output`` ends in ``.pdf`` the results go to ``merge_to_pdf``;
    if it is a directory (or does not exist yet) each page is saved there
    as ``<index>.jpg``.  ``RuntimeError`` is raised for any other output,
    and for ``ocr`` combined with image output.
    """
    input_is_pdf = os.path.splitext(input)[1].lower() == ".pdf"
    if input_is_pdf:
        # PDF mode: one task per page index of the same file
        assert os.path.exists(input)
        page_count = fitz.Document(input).page_count
        start = 0 if first_page is None else first_page - 1
        stop = page_count if last_page is None else last_page
        args = zip(repeat(input), range(start, stop), repeat(dpi),
                   repeat(ocr), repeat(clean))
    else:
        # Glob mode: one task per matched file
        files = sorted(glob.glob(input, recursive=True))
        start = 0 if first_page is None else first_page - 1
        stop = len(files) if last_page is None else last_page
        args = zip(files[start:stop], repeat(0), repeat(-1), repeat(ocr),
                   repeat(clean))
    total = stop - start

    output_is_pdf = os.path.splitext(output)[1].lower() == ".pdf"
    with Pool() as p:
        # imap() is lazy, so results must be consumed while the pool lives.
        results = tqdm(p.imap(clean_single_page, args), total=total)
        if output_is_pdf:
            merge_to_pdf(results, output)
        elif os.path.exists(output) and not os.path.isdir(output):
            raise RuntimeError("invalid output format.")
        else:
            if ocr:
                raise RuntimeError("the OCR flag is useless because we are "
                                   "writing images (not PDF) to the output "
                                   "directory.")
            if not os.path.exists(output):
                Path(output).mkdir(parents=True)
            for (index, page) in enumerate(results):
                file_path = os.path.join(output, f"{index}.jpg")
                assert isinstance(page, Image.Image)
                page.save(file_path)