def extract_information(self, pdf_path, link):
    """Collect document metadata from a PDF file.

    Args:
        pdf_path: Path of the PDF file to inspect.
        link: Value stored verbatim under the ``"download_link"`` key.

    Returns:
        A ``(pdf_path, info)`` tuple. ``info`` maps metadata field names to
        their values. Both date fields are ``None`` when either date is
        missing or cannot be parsed, and all document-info fields are
        ``None`` when the PDF carries no document information at all.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        # May be None when the file has no document information dictionary.
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        try:
            readable1 = extract_date(
                information.getText("/CreationDate").split('-'))
            readable2 = extract_date(
                information.getText("/ModDate").split('-'))
        except Exception:
            # Best effort: a missing or malformed date leaves both unset.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            readable1 = None
            readable2 = None

        def field(name):
            # Tolerate a missing document-info object.
            return getattr(information, name, None)

        info = {
            "author": field("author"),
            "creator": field("creator"),
            "producer": field("producer"),
            "subject": field("subject"),
            "title": field("title"),
            "creation_date": readable1,
            "modification_date": readable2,
            "number_of_pages": number_of_pages,
            "download_link": link,
        }
    return pdf_path, info
def merge(self):
    """Merge every PDF listed in the tree view into one user-chosen file.

    Asks for a destination via ``save_as_pdf``; aborts with an error dialog
    if none is given. Encrypted inputs that cannot be decrypted are skipped
    with a warning. A ``.pdf`` suffix is appended when missing.
    """
    destination = save_as_pdf()
    if not destination:
        messagebox.showerror(MESSAGE_TITLE,
                             "You must specify a file save path.")
        return

    if save_path_suffix_missing := destination[-4:].lower() != ".pdf":
        destination += ".pdf"

    merged = PdfFileWriter()
    for row in self.tree.get_children():
        values = self.tree.item(row, option="values")
        reader = PdfFileReader(values[1])

        locked = reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE)
        if locked:
            messagebox.showwarning(
                MESSAGE_TITLE,
                f"{values[0]} could not be decrypted. "
                f"It will not be included in the merge.")
            continue

        for index in range(reader.getNumPages()):
            merged.addPage(reader.getPage(index))

    with Path(destination).open(mode="wb") as sink:
        merged.write(sink)
    messagebox.showinfo(MESSAGE_TITLE, "PDF Merged")
def get_info(input_file: str):
    """Print and return basic metadata of a PDF file.

    Metadata fields and the page count are only collected when the file is
    not encrypted.

    Returns:
        A ``(True, output)`` tuple, where ``output`` is the dict of
        collected values.
    """
    with open(input_file, 'rb') as stream:
        reader = PdfFileReader(stream, strict=False)
        encrypted = reader.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        if not encrypted:
            details = reader.getDocumentInfo()
            page_total = reader.getNumPages()
            output.update({
                "Author": details.author,
                "Creator": details.creator,
                "Producer": details.producer,
                "Subject": details.subject,
                "Title": details.title,
                "Number of pages": page_total,
            })

    # Display the collected metadata, one "key:value" pair per line.
    report_lines = [f"{key}:{value}" for key, value in output.items()]
    print(
        "## File Information ##################################################"
    )
    print("\n".join(report_lines))
    print(
        "######################################################################"
    )
    return True, output
def test_merge_pdf_output(self):
    """Merging the sample inputs must yield a 23-page PDF."""
    samples = 'tests/pdf_samples/'
    merged_name = 'test_merged_pdf.pdf'
    sources = [
        samples + 'jpeg_w_350.jpg',
        samples + 'pdf_sample_A Sample PDF_loremIpsum_pages_01.pdf',
        samples + 'pdf_sample_b_pages_01.pdf',
        samples + 'pdf_sample_dummy_w3c_pages_01.pdf',
        samples + 'pdf_sample_googledocs_image_pages_02.pdf',
        # The next PDF fails to read:
        # invalid literal for int() with base 10: b'F-1.4'
        # samples + 'pdf_sample_googlesheet_pages_02.pdf',
        samples + 'pdf_sample_libreoffice_exported_ISO19005_pages_02.pdf',
        samples + 'pdf_sample_libreoffice_exported_format_FDF_pages_02.pdf',
        samples + 'pdf_sample_libreoffice_exported_hibrid_format_pages_02.pdf',
        samples + 'pdf_sample_libreoffice_exported_not_hybrid_ISO19005_pages_02.pdf',
        samples + 'pdf_sample_pages_01.pdf',
        # A (path, tuple) entry, passed through to MergeToPdf as-is.
        (samples + 'pdf_sample_readthedocs_pdf_networkdays_pages_019.pdf',
         (0, 2)),
        samples + 'pdf_sample_text_edit_macos_pages_01.pdf',
        samples + 'pdf_sample_wikimedia_org_pages_01.pdf',
        samples + 'sample_pdf_commandline_xhtml2pdf_generated_pages_01.pdf',
        samples + 'issue_repo_pypdf4.pdf',
        samples + 'issue_repo_pypdf4_test.pdf',
    ]

    merger = MergeToPdf(paths_list=sources, output_file_path=merged_name)
    merger.merge_pdfs()

    with open(merged_name, "rb") as merged_file:
        page_total = PdfFileReader(merged_file).getNumPages()
    self.assertEqual(page_total, 23)
def put_watermark(input_pdf, output_pdf, watermark):
    """Merge the first page of ``watermark`` into every page of ``input_pdf``.

    The resulting pages are written to ``output_pdf``.
    """
    # Only the first page of the watermark document is used.
    stamp = PdfFileReader(watermark).getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    for index in range(source.getNumPages()):
        current = source.getPage(index)
        # Merge the watermark page into the current page, then keep it.
        current.mergePage(stamp)
        result.addPage(current)

    with open(output_pdf, 'wb') as sink:
        result.write(sink)
def main():
    """Command-line entry point: copy a PDF while applying fixes.

    Copies all pages of the input, inserts a blank page at index 4, adds
    the outline entries, sets the page layout and metadata, and writes the
    result to the output path.
    """
    cli = argparse.ArgumentParser(
        prog='ca6fix',
        description="Fix some disappointmented points in Computer Architecture Quantitative Approach 6th Edition Japanese translation PDF file.",
        usage='ca6fix -i ca6.pdf -o ca6_fixed.pdf',
        add_help=True)
    cli.add_argument('-i', '--input', required=True, help='input PDF file')
    cli.add_argument('-o', '--output', required=True, help='output PDF file')
    options = cli.parse_args()

    source = PdfFileReader(options.input)
    result = PdfFileWriter()

    for number in range(source.getNumPages()):
        result.addPage(source.getPage(number))

    result.insertBlankPage(None, None, 4)

    for entry in outline:
        add_outline(result, entry, 21)

    result.setPageLayout('/TwoPageRight')

    metadata = {
        '/Title': 'コンピュータアーキテクチャ 定量的アプローチ[第6版]',
        '/Author': 'ジョン・L・ヘネシー, デイビッド・A・パターソン(著), 中條拓伯, 天野英晴, 鈴木 貢(訳)',
    }
    result.addMetadata(metadata)

    with open(options.output, 'wb') as sink:
        result.write(sink)
def add_watermark(file_path, file_stage, fileno):
    """Add a watermark to every page of a PDF and rewrite the file in place.

    Encrypted inputs are left untouched. The output is encrypted with an
    empty user password and carries over the original document metadata
    when there is any.

    Args:
        file_path: Path of the PDF to watermark; it is overwritten.
        file_stage: Passed through to ``create_watermark``.
        fileno: Passed through to ``create_watermark``.
    """
    pdf_input = PdfFileReader(file_path)
    if pdf_input.isEncrypted:
        return
    pdf_info = pdf_input.getDocumentInfo()

    w, h = pdf_input.getPage(0).mediaBox[2:]
    # Convert the page size to millimetres.
    page_size_mm = (int(w) * 0.3528, int(h) * 0.3528)

    # Create the watermark file.
    mark = create_watermark(page_size_mm, file_stage, fileno)

    pdf_output = PdfFileWriter()
    # Keep the watermark stream open until the output is written, and make
    # sure both file handles are closed afterwards.
    with open(mark, 'rb') as mark_file:
        watermark_page = PdfFileReader(mark_file, strict=False).getPage(0)
        for i in range(pdf_input.getNumPages()):
            page = pdf_input.getPage(i)
            page.mergePage(watermark_page)
            pdf_output.addPage(page)

        # Add the password.
        # NOTE(review): the owner password is hard-coded; consider making it
        # configurable.
        pdf_output.encrypt(user_pwd='', owner_pwd='12345', use_128bit=True)
        if pdf_info is not None:
            pdf_output.addMetadata(pdf_info)

        # NOTE(review): this overwrites the input file; it assumes the reader
        # no longer needs the on-disk file at this point -- confirm for the
        # PyPDF version in use.
        with open(file_path, 'wb') as out_file:
            pdf_output.write(out_file)
def remove_watermark(wm_text, inputFile, outputFile):
    """Blank out text watermarks in a PDF.

    Every ``Tj`` text-showing operand that starts with ``wm_text`` is
    replaced by an empty string; the modified pages are written to
    ``outputFile``.

    Args:
        wm_text: Prefix of the watermark text to remove.
        inputFile: Path of the source PDF.
        outputFile: Path of the PDF to write.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    with open(inputFile, "rb") as f:
        # The second positional parameter of PdfFileReader is ``strict``,
        # not a file mode, so no "rb" is passed here.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # Pages without a content stream have nothing to strip.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator == b_("Tj"):
                        text = operands[0]
                        if isinstance(text, str) and text.startswith(wm_text):
                            operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Write while the source stream is still open.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
def _merge_documents_PyPDF4(self, file_name, paths):
    """Merge the PDFs in ``paths`` into ``settings.SAVE_PATH / file_name``.

    Falsy entries in ``paths`` are skipped. A ``PdfReadError`` is logged
    rather than raised. The output path is returned in either case.
    """
    output = settings.SAVE_PATH / file_name
    try:
        merged = PdfFileWriter()
        for source_path in paths:
            if not source_path:
                continue
            reader = PdfFileReader(str(source_path), strict=False)
            for index in range(reader.getNumPages()):
                merged.addPage(reader.getPage(index))

        # Write out the merged PDF.
        with open(output, 'wb') as sink:
            merged.write(sink)
    except utils.PdfReadError as error:
        LogHandler.execution_log(error=error)
        label = output.name.replace(".PDF", "")
        LogHandler.execution_log(error=f'ERROR ON: {label}')
    return output
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """Strip a text watermark from a PDF.

    ``Tj`` operands starting with ``wm_text`` are replaced by an empty
    string. When ``pages`` is given, only pages whose zero-based index (as
    a string) is in ``pages`` are processed and added to the writer.

    Returns:
        A ``(True, pdf_reader, pdf_writer)`` tuple. The input file stays
        open because the returned reader still refers to it.
    """
    reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    writer = PdfFileWriter()
    tj_operator = b_("Tj")

    for index in range(reader.getNumPages()):
        # Restrict to the requested pages, if any were given.
        if pages and str(index) not in pages:
            continue

        current = reader.getPage(index)
        stream = ContentStream(current["/Contents"].getObject(), reader)

        # Blank every Tj string operand that starts with the watermark text.
        for operands, operator in stream.operations:
            if operator != tj_operator:
                continue
            shown = operands[0]
            if isinstance(shown, str) and shown.startswith(wm_text):
                operands[0] = TextStringObject('')

        current[NameObject('/Contents')] = stream
        writer.addPage(current)

    return True, reader, writer
def encrypt_file(self):
    """Encrypt the selected PDF with the entered password and save a copy.

    Shows an error dialog and aborts when no file is selected, no password
    is entered, or no save path is chosen. Shows a warning and aborts when
    the file is already encrypted. A ``.pdf`` suffix is appended to the
    save path when missing.
    """
    path = self.file_selector.getpath()
    if not path:
        messagebox.showerror(MESSAGE_TITLE, "You must select a PDF file.")
        return
    if not self.password.get():
        messagebox.showerror(MESSAGE_TITLE, "You must enter a password.")
        return

    pdf_reader = PdfFileReader(path)
    if pdf_reader.isEncrypted:
        messagebox.showwarning(MESSAGE_TITLE, "File is already encrypted.")
        return

    pdf_writer = PdfFileWriter()
    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))
    pdf_writer.encrypt(self.password.get())

    save_path = save_as_pdf(parent=self)
    if not save_path:
        messagebox.showerror(MESSAGE_TITLE,
                             "You must specify a file save path")
        # Stop here: without a path there is nothing to write to.
        return
    if save_path[-4:].lower() != ".pdf":
        save_path += ".pdf"

    with Path(save_path).open(mode="wb") as save_file:
        pdf_writer.write(save_file)
    messagebox.showinfo(MESSAGE_TITLE, "PDF encrypted.")
def make_booklet(input_name, output_name, blanks=0):
    """Rearrange a PDF into booklet order and write it to ``output_name``.

    Args:
        input_name: Path of the source PDF.
        output_name: Path of the booklet PDF to write.
        blanks: Number of ``None`` placeholder pages put before the first
            real page.
    """
    # Keep the input open until the output is written, and close both
    # files deterministically.
    with open(input_name, "rb") as input_file:
        reader = PdfFileReader(input_file)
        pages = [None] * blanks
        pages.extend(reader.getPage(p) for p in range(reader.getNumPages()))

        sheets = build_booklet(pages)

        # Each output page is twice as wide as the first input page.
        first_box = reader.getPage(0).mediaBox
        page_size = (first_box.getWidth() * 2, first_box.getHeight())

        writer = PdfFileWriter()
        # We want to group fronts and backs together.
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.back)
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.front)

        with open(output_name, "wb") as output_file:
            writer.write(output_file)

    print_instructions(sheets)
def rotate(input, output, pages, verbosity, rotate, **kwargs):
    """rotate selected pages

    Rotate selected pages and outputs in new pdf

    Args:
        input: Source PDF, passed to ``PdfFileReader``.
        output: Writable binary file object; its ``name`` is echoed.
        pages: 1-based page numbers to rotate, or ``None`` for all pages.
        verbosity: 0 is silent, 1 prints progress dots, 2 also names pages.
        rotate: One of ``'left'``, ``'right'`` or ``'inverted'``.
    """
    source = PdfFileReader(input)
    angle = {'left': -90, 'right': 90, 'inverted': 180}[rotate]

    # Page numbers are 1-based, so the range has to end at page_count + 1;
    # stopping at page_count would drop the last page.
    page_count = source.getNumPages()
    all_pages = range(1, page_count + 1)
    if pages is None:
        pages = all_pages

    output_pdf = PdfFileWriter()
    for page_num in all_pages:
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)
        page = source.getPage(page_num - 1)
        if page_num in pages:
            page.rotateClockwise(angle)
        output_pdf.addPage(page)

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
def ohin(obj_pages: List[int],
         save_path: str,
         input_path: str,
         img_path: str,
         position: List[float],
         img_size: List[float],
         pdf_size: str = "A4") -> None:
    """Merge an overlay page into selected pages of a PDF.

    A temporary ``overlay.pdf`` is produced by ``overlay_pdf_maker``
    (presumably rendering ``img_path`` at ``position``/``img_size`` --
    confirm against that helper). Its first page is merged into each page
    listed in ``obj_pages``, and the whole document is written to
    ``save_path``.

    Args:
        obj_pages: 1-based numbers of the pages to overlay.
        save_path: Path of the PDF to write.
        input_path: Path of the source PDF.
        img_path: Passed through to ``overlay_pdf_maker``.
        position: Passed through to ``overlay_pdf_maker``.
        img_size: Passed through to ``overlay_pdf_maker``.
        pdf_size: Passed through to ``overlay_pdf_maker``.
    """
    overlay_path = "overlay.pdf"
    overlay_pdf_maker(overlay_path, img_path, position, img_size, pdf_size)
    try:
        # Both inputs must stay open until the output has been written.
        with open(overlay_path, 'rb') as f_overlay, \
                open(input_path, 'rb') as f_target:
            overlay = PdfFileReader(f_overlay).getPage(0)
            reader = PdfFileReader(f_target)
            num_pages = reader.getNumPages()
            assert num_pages >= max(obj_pages)

            for p in obj_pages:
                reader.getPage(p - 1).mergePage(overlay)

            writer = PdfFileWriter()
            for p in range(num_pages):
                writer.addPage(reader.getPage(p))

            with open(save_path, 'wb') as f:
                writer.write(f)
    finally:
        # Remove the temporary overlay even when something above failed;
        # by now its handle has been closed by the with-block.
        os.remove(overlay_path)
def splitPdf(path='./input.pdf', N=5):
    """Split a PDF into two files after page ``N``.

    Pages 1..N go to ``<stem>p1_<N>.pdf`` and pages N+1..end go to
    ``<stem>p<N+1>_<total>.pdf``, where ``<stem>`` is ``path`` without its
    extension. Nothing is written when ``path`` is not a file or the
    document has no more than ``N`` pages.

    Args:
        path: Path of the PDF to split.
        N: Number of pages that go into the first output file.
    """
    if not os.path.isfile(path):
        return

    pdfFileReader = PdfFileReader(path)
    numPages = pdfFileReader.getNumPages()  # total number of pages
    print(numPages)

    fname = os.path.splitext(path)[0]
    outFile1 = '{}p1_{}.pdf'.format(fname, N)
    outFile2 = '{}p{}_{}.pdf'.format(fname, N + 1, numPages)
    print(outFile1, outFile2)

    if numPages > N:
        # Indices 0..N-1 are pages 1..N; the rest go to the second file.
        parts = ((range(N), outFile1), (range(N, numPages), outFile2))
        for page_indices, out_name in parts:
            # A fresh writer per part, so the second file does not also
            # contain the pages of the first one.
            pdfFileWriter = PdfFileWriter()
            for index in page_indices:
                pdfFileWriter.addPage(pdfFileReader.getPage(index))
            with open(out_name, 'wb') as out_file:
                pdfFileWriter.write(out_file)
def readPdf(readFile='./input.pdf'):
    """Print document-level information and per-page details of a PDF."""
    # A file object works too: PdfFileReader(open(readFile, 'rb'))
    reader = PdfFileReader(readFile)

    # Document information.
    print('documentInfo = %s' % reader.getDocumentInfo())

    # Page layout.
    print('pageLayout = %s ' % reader.getPageLayout())

    # Page mode.
    print('pageMode = %s' % reader.getPageMode())

    # XMP metadata.
    print('xmpMetadata = %s ' % reader.getXmpMetadata())

    # Number of pages.
    total = reader.getNumPages()
    print('pageCount = %s' % total)

    position = 0
    while position < total:
        # Page object at the given index.
        current = reader.getPage(position)
        # e.g. <class 'PyPDF2.pdf.PageObject'>
        print('index = %d , pageObj = %s' % (position, type(current)))

        # Page number of this page object within the document.
        print('pageNumber = %s ' % reader.getPageNumber(current))
        position += 1
def rotate_pdf(path, degrees, output):
    """Rotate every page of a PDF clockwise and write the result.

    Args:
        path: Source PDF, passed to ``PdfFileReader``.
        degrees: Clockwise rotation; converted with ``int``.
        output: Either a writable binary file object or a path to create.
    """
    angle = int(degrees)
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(path)

    for page in range(pdf_reader.getNumPages()):
        original = pdf_reader.getPage(page)
        pdf_writer.addPage(original.rotateClockwise(angle))

    if hasattr(output, "write"):
        # Already a stream: write to it and leave closing to the caller.
        pdf_writer.write(output)
    else:
        # Treat anything else as a path, as the other helpers here do.
        with open(output, "wb") as out_file:
            pdf_writer.write(out_file)
def merge_pdfs(pdf1_path, pdf2_path):
    """Concatenate two PDFs into ``Resultant_PDF_After_Merging.pdf``.

    All pages of ``pdf1_path`` come first, then all pages of ``pdf2_path``.
    A confirmation line is printed once the file has been written.
    """
    pdf_writer = PdfFileWriter()

    # The second positional parameter of PdfFileReader is ``strict``, not a
    # file mode, so no 'rb' is passed here.
    for source_path in (pdf1_path, pdf2_path):
        reader = PdfFileReader(source_path)
        for i in range(reader.getNumPages()):
            pdf_writer.addPage(reader.getPage(i))

    with open('Resultant_PDF_After_Merging.pdf', 'wb') as fh:
        pdf_writer.write(fh)

    # Report the paths actually merged (the parameters, not outside names).
    print(f"Merged the pdfs '{pdf1_path}', and '{pdf2_path}'.")
def join_pdfs(paths, output):
    """Concatenate the PDFs in ``paths`` and write the result.

    Args:
        paths: Iterable of source PDFs, each passed to ``PdfFileReader``.
        output: Either a writable binary file object or a path to create.
    """
    pdf_writer = PdfFileWriter()
    for path in paths:
        pdf_reader = PdfFileReader(path)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

    if hasattr(output, "write"):
        # Already a stream: write to it and leave closing to the caller.
        pdf_writer.write(output)
    else:
        # Treat anything else as a path.
        with open(output, "wb") as out_file:
            pdf_writer.write(out_file)
def on_file_selected(self):
    """Update the page-count display for the currently selected file.

    Does nothing when no path is selected. If the file is encrypted and
    cannot be decrypted, the page count is blanked and the selection is
    cleared.
    """
    if not self.file_selector.getpath():
        return

    reader = PdfFileReader(self.file_selector.getpath())
    locked = reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE)
    if locked:
        self.page_count_text.set("")
        self.file_selector.clear()
        return

    self.page_count_text.set(reader.getNumPages())
def extract(fileobj):
    """Extract the text of every page of the PDF in ``fileobj``.

    ``fmt`` is read from the enclosing scope. When it equals ``"string"``
    the page texts are concatenated into one string; otherwise a list with
    one entry per page is returned.
    """
    reader = PdfFileReader(fileobj, strict=False)
    page_texts = [
        reader.getPage(number).extractText()
        for number in range(reader.getNumPages())
    ]
    if fmt == "string":
        return "".join(page_texts)
    return page_texts
def extract_text(pdf_path):
    """Write the extracted text of every page to ``Text_Output.txt``.

    Args:
        pdf_path: Source PDF, passed to ``PdfFileReader``.
    """
    # The second positional parameter of PdfFileReader is ``strict``, not a
    # file mode, so no 'rb' is passed here.
    pdf_reader = PdfFileReader(pdf_path)

    # The context manager makes sure the text is flushed and the handle
    # closed, even if extraction fails part-way.
    with open('Text_Output.txt', 'w') as f:
        for i in range(pdf_reader.getNumPages()):
            f.write(pdf_reader.getPage(i).extractText())
def extract_pdf_text(path, format="string"):
    """Extract the text of every page of the PDF at ``path``.

    Returns one concatenated string when ``format`` equals ``"string"``,
    otherwise a list with one text entry per page.
    """
    as_string = format == "string"
    with open(path, "rb") as stream:
        reader = PdfFileReader(stream)
        collected = []
        for number in range(reader.getNumPages()):
            collected.append(reader.getPage(number).extractText())
    return "".join(collected) if as_string else collected
def pdf_appendfile(inpath: str, appendpath: str, page_no: int, outpath: str):
    """Insert all pages of ``appendpath`` after page ``page_no`` of ``inpath``.

    ``page_no`` is 1-based. When ``outpath`` is falsy the result is written
    to ``<input base name>_output.pdf``. Prints the output path when done.
    """
    base_name = os.path.basename(inpath)
    stem = os.path.splitext(base_name)[0]
    if not outpath:
        outpath = f'{stem}_output.pdf'

    main_doc = PdfFileReader(inpath)
    extra_doc = PdfFileReader(appendpath)
    combined = PdfFileWriter()

    with open(outpath, 'wb') as sink:
        for index in range(main_doc.getNumPages()):
            combined.addPage(main_doc.getPage(index))
            if index + 1 != int(page_no):
                continue
            # Splice the whole second document in right after this page.
            for extra_index in range(extra_doc.getNumPages()):
                combined.addPage(extra_doc.getPage(extra_index))
        combined.write(sink)

    print(f'Created: {outpath}')
def extractPdfPage(path, pageIndex, extractedPdfName):
    """Write the page at zero-based ``pageIndex`` to ``extractedPdfName``.

    Nothing is written when no page index equals ``pageIndex``.
    """
    document = PdfFileReader(path)
    for number in range(document.getNumPages()):
        if number == pageIndex:
            single = PdfFileWriter()
            single.addPage(document.getPage(number))
            with open(extractedPdfName, 'wb') as sink:
                single.write(sink)
def mergePdfFiles(paths, outputPath):
    """Concatenate the PDFs in ``paths``, in order, into ``outputPath``."""
    combined = PdfFileWriter()
    for source in paths:
        reader = PdfFileReader(source)
        total = reader.getNumPages()
        index = 0
        while index < total:
            combined.addPage(reader.getPage(index))
            index += 1

    with open(outputPath, 'wb') as sink:
        combined.write(sink)
def split_to_single_pages(path, name_of_split):
    """Write each page of a PDF to its own file.

    Page ``i`` (zero-based) is written to ``<name_of_split><i>.pdf``.
    """
    document = PdfFileReader(path)
    for number in range(document.getNumPages()):
        single = PdfFileWriter()
        single.addPage(document.getPage(number))
        target = f'{name_of_split}{number}.pdf'
        with open(target, 'wb') as sink:
            single.write(sink)
def add_pdf_subset(pdf_writer, input_path, page_start, page_end):
    """Append pages ``page_start`` .. ``page_end - 1`` of a PDF to a writer.

    Args:
        pdf_writer: Writer object the pages are added to.
        input_path: Source PDF, passed to ``PdfFileReader``.
        page_start: Zero-based index of the first page to add.
        page_end: Zero-based index one past the last page to add.

    Raises:
        ValueError: If the document has fewer than ``page_end`` pages.
    """
    pdf_reader = PdfFileReader(input_path)
    if pdf_reader.getNumPages() < page_end:
        # Actually raise: an exception object that is only constructed has
        # no effect.
        raise ValueError("too few pages")

    for page in range(page_start, page_end):
        # Add each page to the writer object.
        pdf_writer.addPage(pdf_reader.getPage(page))
def getPubFile(url):
    """Download a PDF and return its extracted text.

    Args:
        url: Location of the PDF, passed to ``wget.download``.

    Returns:
        The text of all pages, each followed by ``" \\n"``, or ``None``
        when the download fails with an ``HTTPError``.
    """
    try:
        filename = wget.download(url)
    except HTTPError as e:
        # Placeholders are required: extra arguments without them make the
        # logging module report a formatting error instead of the message.
        logger.debug('Download error: %s %s', e.code, e.read())
        return None

    # Close the downloaded file once its text has been extracted.
    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        return "".join(
            pdf.getPage(i).extractText() + " \n"
            for i in range(pdf.getNumPages()))
def merge_pdfs(paths, output):
    """Merge the PDFs in ``paths``, in order, into the file at ``output``."""
    merged = PdfFileWriter()

    for source in paths:
        reader = PdfFileReader(source)
        # Copy every page of this document into the writer.
        page_numbers = range(reader.getNumPages())
        for number in page_numbers:
            merged.addPage(reader.getPage(number))

    # Write out the merged PDF.
    with open(output, 'wb') as sink:
        merged.write(sink)