def get_metadata(file, key):
    """Return one entry of a PDF's document-information dictionary.

    Args:
        file: Path of the PDF file to read.
        key: Metadata key without the leading slash (e.g. ``"Author"``).

    Returns:
        The value stored under ``'/' + key``, or the string ``'Key Error'``
        when the key is missing or the PDF carries no metadata at all.
    """
    with open(file, 'rb') as pdf:
        metadata = PdfFileReader(pdf).getDocumentInfo()
        print(metadata)
        # getDocumentInfo() yields None for PDFs without an /Info dictionary;
        # report that the same way as a missing key instead of crashing.
        if metadata is None:
            return 'Key Error'
        return metadata.get('/' + key, 'Key Error')
def test_merge_pdf_output(self):
    """Merge all sample inputs and verify the page count of the result."""
    merged_pdf_path = 'test_merged_pdf.pdf'
    sample_paths = [
        'tests/pdf_samples/jpeg_w_350.jpg',
        'tests/pdf_samples/pdf_sample_A Sample PDF_loremIpsum_pages_01.pdf',
        'tests/pdf_samples/pdf_sample_b_pages_01.pdf',
        'tests/pdf_samples/pdf_sample_dummy_w3c_pages_01.pdf',
        'tests/pdf_samples/pdf_sample_googledocs_image_pages_02.pdf',
        # The next PDF fails to read:
        # invalid literal for int() with base 10: b'F-1.4'
        # 'tests/pdf_samples/pdf_sample_googlesheet_pages_02.pdf',
        'tests/pdf_samples/pdf_sample_libreoffice_exported_ISO19005_pages_02.pdf',
        'tests/pdf_samples/pdf_sample_libreoffice_exported_format_FDF_pages_02.pdf',
        'tests/pdf_samples/pdf_sample_libreoffice_exported_hibrid_format_pages_02.pdf',
        'tests/pdf_samples/pdf_sample_libreoffice_exported_not_hybrid_ISO19005_pages_02.pdf',
        'tests/pdf_samples/pdf_sample_pages_01.pdf',
        # (path, (0, 2)): presumably a page selection understood by
        # MergeToPdf -- TODO confirm against its implementation.
        ('tests/pdf_samples/pdf_sample_readthedocs_pdf_networkdays_pages_019.pdf',
         (0, 2)),
        'tests/pdf_samples/pdf_sample_text_edit_macos_pages_01.pdf',
        'tests/pdf_samples/pdf_sample_wikimedia_org_pages_01.pdf',
        'tests/pdf_samples/sample_pdf_commandline_xhtml2pdf_generated_pages_01.pdf',
        'tests/pdf_samples/issue_repo_pypdf4.pdf',
        'tests/pdf_samples/issue_repo_pypdf4_test.pdf',
    ]

    merger = MergeToPdf(paths_list=sample_paths,
                        output_file_path=merged_pdf_path)
    merger.merge_pdfs()

    # Re-open the merged document and count its pages.
    with open(merged_pdf_path, "rb") as merged_file:
        page_count = PdfFileReader(merged_file).getNumPages()
    self.assertEqual(page_count, 23)
def merge(self):
    """Merge every PDF listed in the tree view into one file.

    Asks the user for a destination (``save_as_pdf``), appends ``.pdf``
    when the chosen name lacks that extension, and concatenates the pages
    of each listed document in tree order. Encrypted documents that cannot
    be decrypted are skipped with a warning. Message boxes report the
    outcome; nothing is returned.
    """
    save_path = save_as_pdf()
    if not save_path:
        messagebox.showerror(MESSAGE_TITLE, "You must specify a file save path.")
        return

    has_pdf_suffix = save_path[-4:].lower() == ".pdf"
    if not has_pdf_suffix:
        save_path = save_path + ".pdf"

    writer = PdfFileWriter()
    for row_id in self.tree.get_children():
        # Column 0 is used as the display name, column 1 as the file path.
        item_values = self.tree.item(row_id, option="values")
        reader = PdfFileReader(item_values[1])

        if reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE):
            messagebox.showwarning(
                MESSAGE_TITLE,
                f"{item_values[0]} could not be decrypted. It will not be "
                f"included in the merge.")
            continue

        for page_index in range(reader.getNumPages()):
            writer.addPage(reader.getPage(page_index))

    with Path(save_path).open(mode="wb") as save_file:
        writer.write(save_file)
    messagebox.showinfo(MESSAGE_TITLE, "PDF Merged")
def pdf_meta(filename):
    """Print every document-information entry of a PDF file.

    Args:
        filename: Path of the PDF to inspect.

    A header line is always printed; PDFs without any metadata produce no
    further output.
    """
    with open(filename, 'rb') as pdf:
        doc = PdfFileReader(pdf).getDocumentInfo()
        print(f'[***] PDF MetaData: {str(filename)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; iterating None would raise TypeError.
        for item in doc or ():
            print(f'[++] {item} : {doc[item]}')
def extract_information(self, pdf_path, link):
    """Collect the metadata of a PDF into a plain dictionary.

    Args:
        pdf_path: Path of the PDF file to read.
        link: Download link stored verbatim under ``"download_link"``.

    Returns:
        A ``(pdf_path, info)`` tuple. ``info`` holds the author, creator,
        producer, subject, title, creation/modification dates, page count
        and the download link. Both dates are ``None`` when either of them
        cannot be extracted; the textual fields are ``None`` when the PDF
        has no document-information dictionary.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        try:
            creation_date = extract_date(
                information.getText("/CreationDate").split('-'))
            modification_date = extract_date(
                information.getText("/ModDate").split('-'))
        except Exception:
            # Missing or malformed dates (or no metadata at all). Catching
            # Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt/SystemExit propagate.
            creation_date = None
            modification_date = None

        # getDocumentInfo() returns None for PDFs without an /Info
        # dictionary; report empty fields instead of raising.
        info = {
            field: (getattr(information, field)
                    if information is not None else None)
            for field in ("author", "creator", "producer", "subject", "title")
        }
        info.update({
            "creation_date": creation_date,
            "modification_date": modification_date,
            "number_of_pages": number_of_pages,
            "download_link": link,
        })
    return pdf_path, info
def rotate_pdf(self, path: str, page_num: str, rotate_type: str, outpdf: str):
    """Rotate one page of a PDF by 90 degrees and save it as a new file.

    Only the rotated page is written to the output document.

    Args:
        path: Path of the PDF file to process.
        page_num: Page number; converted with ``int()`` and passed to
            ``getPage()``.
        rotate_type: ``"0"`` rotates clockwise, ``"1"`` counter-clockwise.
        outpdf: Output file name without directory; it is appended to
            ``self.processed``.

    Returns:
        A status message (in Chinese): an input-error message for any other
        ``rotate_type``, otherwise a completion message.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(path)

    if rotate_type not in ("0", "1"):
        return "输入错误,请重新输入!"

    page = reader.getPage(int(page_num))
    if rotate_type == "0":
        rotated = page.rotateClockwise(90)
    else:
        rotated = page.rotateCounterClockwise(90)
    writer.addPage(rotated)

    with open(self.processed + outpdf, "wb") as out_file:
        writer.write(out_file)
    return "旋转页面完成!"
def encrypt_file(self):
    """Encrypt the selected PDF with the entered password.

    Validates that a file and a password were provided and that the file is
    not already encrypted, copies all pages into a new document, encrypts
    it and writes it to a user-chosen path (``.pdf`` is appended when the
    chosen name lacks that extension). Every outcome is reported through a
    message box; nothing is returned.
    """
    path = self.file_selector.getpath()
    if not path:
        messagebox.showerror(MESSAGE_TITLE, "You must select a PDF file.")
        return
    if not self.password.get():
        messagebox.showerror(MESSAGE_TITLE, "You must enter a password.")
        return

    pdf_reader = PdfFileReader(path)
    if pdf_reader.isEncrypted:
        messagebox.showwarning(MESSAGE_TITLE, "File is already encrypted.")
        return

    pdf_writer = PdfFileWriter()
    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))
    pdf_writer.encrypt(self.password.get())

    save_path = save_as_pdf(parent=self)
    if not save_path:
        messagebox.showerror(MESSAGE_TITLE, "You must specify a file save path")
        # Stop here: without a path there is nothing to write. Falling
        # through would create a file literally named ".pdf" or crash.
        return
    if save_path[-4:].lower() != ".pdf":
        save_path += ".pdf"

    with Path(save_path).open(mode="wb") as save_file:
        pdf_writer.write(save_file)
    messagebox.showinfo(MESSAGE_TITLE, "PDF encrypted.")
def splitPages(testNameNum, testPath, pageRanges, outputDirs):
    """Split a PDF into sections, each prefixed with a watermarked cover.

    Args:
        testNameNum: Indexable pair; elements ``[1]`` and ``[0]`` are passed
            (in that order) to ``genSectionFilePath``.
        testPath: Path of the source PDF.
        pageRanges: Iterable of page indexes; each value ends one section
            (exclusive) and starts the next one. The first section starts
            at page index 1 because page 0 is the cover.
        outputDirs: Passed through to ``genSectionFilePath``.
    """
    start = 1
    for key, bookmark in enumerate(pageRanges):
        # The source is re-read for every section -- presumably so that each
        # section watermarks a pristine cover page (TODO confirm). The
        # ``with`` block closes the handle even when a step raises.
        with open(testPath, 'rb') as f:
            pdf = PdfFileReader(f)
            pdfWriter = PdfFileWriter()

            # Watermark the original cover and put it at the front.
            newCover = addWaterMark(pdf.getPage(0), key)
            pdfWriter.addPage(newCover)

            for page in range(start, bookmark):
                pdfWriter.addPage(pdf.getPage(page))

            outputFileName = genSectionFilePath(
                outputDirs, testNameNum[1], testNameNum[0], key)
            with open(outputFileName, 'wb') as out:
                pdfWriter.write(out)
        start = bookmark
def main():
    """Command-line entry point of ``ca6fix``.

    Copies every page of the input PDF, inserts one blank page at index 4,
    adds the entries of the module-level ``outline`` via ``add_outline``,
    sets the page layout and title/author metadata, and writes the result
    to the output path.
    """
    cli = argparse.ArgumentParser(
        prog='ca6fix',
        description=
        "Fix some disappointmented points in Computer Architecture Quantitative Approach 6th Edition Japanese translation PDF file.",
        usage='ca6fix -i ca6.pdf -o ca6_fixed.pdf',
        add_help=True)
    cli.add_argument('-i', '--input', help='input PDF file', required=True)
    cli.add_argument('-o', '--output', help='output PDF file', required=True)
    options = cli.parse_args()

    source = PdfFileReader(options.input)
    target = PdfFileWriter()
    for page_index in range(source.getNumPages()):
        target.addPage(source.getPage(page_index))

    # Default-sized blank page inserted at page index 4.
    target.insertBlankPage(None, None, 4)

    for entry in outline:
        add_outline(target, entry, 21)

    target.setPageLayout('/TwoPageRight')
    metadata = {
        '/Title': 'コンピュータアーキテクチャ 定量的アプローチ[第6版]',
        '/Author': 'ジョン・L・ヘネシー, デイビッド・A・パターソン(著), 中條拓伯, 天野英晴, 鈴木 貢(訳)'
    }
    target.addMetadata(metadata)

    with open(options.output, 'wb') as out_file:
        target.write(out_file)
def imprimir_metadata(nombreArchivo):
    """Print every document-information entry of a PDF file.

    Args:
        nombreArchivo: Path of the PDF to inspect.

    A header line is always printed; PDFs without any metadata produce no
    further output.
    """
    with open(nombreArchivo, 'rb') as pdf:
        doc_info = PdfFileReader(pdf).getDocumentInfo()
        print(f'[*] MetaData PDF para el archivo: {str(nombreArchivo)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; iterating None would raise TypeError.
        for meta_item in doc_info or ():
            print(f'[+] {meta_item}: {doc_info[meta_item]}')
def pdf_to_txt(file):
    """Return the lower-cased text of all pages of a PDF.

    Args:
        file: Anything ``PdfFileReader`` accepts as its source.

    Returns:
        The concatenation of ``extractText()`` of every page, lower-cased.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(index).extractText()
        for index in range(reader.numPages)
    ]
    return ''.join(page_texts).lower()
def _merge_documents_PyPDF4(self, file_name, paths):
    """Merge the given PDFs into ``settings.SAVE_PATH / file_name``.

    Args:
        file_name: Name of the merged file inside ``settings.SAVE_PATH``.
        paths: Iterable of source paths; falsy entries are skipped.

    Returns:
        The output path. It is returned even when a ``PdfReadError`` was
        logged, in which case the merged file may not have been written.
    """
    output = settings.SAVE_PATH / file_name
    try:
        writer = PdfFileWriter()
        for source_path in filter(None, paths):
            reader = PdfFileReader(str(source_path), strict=False)
            # Append each page of this document to the writer.
            for page_index in range(reader.getNumPages()):
                writer.addPage(reader.getPage(page_index))

        # Write out the merged PDF.
        with open(output, 'wb') as out:
            writer.write(out)
    except utils.PdfReadError as error:
        LogHandler.execution_log(error=error)
        LogHandler.execution_log(
            error=f'ERROR ON: {output.name.replace(".PDF", "")}')
    return output
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """Blank out watermark text in the pages of a PDF.

    Args:
        input_file: Path of the PDF to process.
        wm_text: Text operands of ``Tj`` operators that start with this
            string are replaced by an empty string.
        pages: Optional collection of page indexes given as strings; when
            provided, only those pages are processed and added to the
            writer.

    Returns:
        ``(True, reader, writer)``. The input file is deliberately left
        open because the returned reader is still attached to it.
    """
    reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    writer = PdfFileWriter()

    for page_index in range(reader.getNumPages()):
        # Restrict to the requested pages, if any were given.
        if pages and str(page_index) not in pages:
            continue

        current_page = reader.getPage(page_index)
        content = ContentStream(current_page["/Contents"].getObject(), reader)

        # Replace the string operand of matching "Tj" operations.
        for operands, operator in content.operations:
            if operator != b_("Tj"):
                continue
            text = operands[0]
            if isinstance(text, str) and text.startswith(wm_text):
                operands[0] = TextStringObject('')

        current_page[NameObject('/Contents')] = content
        writer.addPage(current_page)

    return True, reader, writer
def split_pages(testnamenum, test_path, page_ranges):
    """Split a PDF into sections, each prefixed with a watermarked cover.

    Args:
        testnamenum: Passed through to ``generate_section_filepath``.
        test_path: Path of the source PDF.
        page_ranges: Iterable of page indexes; each value ends one section
            (exclusive) and starts the next one. The first section starts
            at page index 1 because page 0 is the cover.

    Section files are written to the path produced by
    ``generate_section_filepath(CONST_LOCAL, testnamenum, key)``.
    """
    start = 1
    for key, bookmark in enumerate(page_ranges):
        # The source is re-read for every section -- presumably so that each
        # section watermarks a pristine cover page (TODO confirm). The
        # ``with`` block closes the handle even when a step raises.
        with open(test_path, 'rb') as f:
            pdf = PdfFileReader(f)
            pdf_writer = PdfFileWriter()

            # Watermark the original cover and put it at the front.
            new_cover = add_watermark(pdf.getPage(0), key)
            pdf_writer.addPage(new_cover)

            for page in range(start, bookmark):
                pdf_writer.addPage(pdf.getPage(page))

            local_filename = generate_section_filepath(
                CONST_LOCAL, testnamenum, key)
            with open(local_filename, 'wb') as out:
                pdf_writer.write(out)
            # NOTE: uploading the section to Dropbox is currently disabled.
        start = bookmark
def split(input, destination, pages, format, verbosity, **kwargs):
    """split pdf into single page file.

    pdfcli split document.pdf --format page-%02d.pdf -p 1,10-20
    """
    # `pages` holds 1-based page numbers; None selects every page.
    # `format` is a %-style pattern that receives the page number and yields
    # the file name inside `destination`.
    reader = PdfFileReader(input)
    if pages is None:
        pages = range(1, reader.numPages + 1)

    target_dir = Path(destination)
    if not target_dir.exists():
        target_dir.mkdir(parents=True)

    for page_num in pages:
        if verbosity >= 1:
            click.echo("Extracting page %s" % page_num)

        # due to a bug PyPDF4 file need to be reopened
        reader = PdfFileReader(input)
        target_file = (target_dir / Path(format % page_num)).absolute()

        single_page_pdf = PdfFileWriter()
        single_page_pdf.addPage(reader.getPage(page_num - 1))
        with open(str(target_file), "wb") as out_file:
            single_page_pdf.write(out_file)
def rotate(input, output, pages, verbosity, rotate, **kwargs):
    """rotate selected pages

    Rotate selected pages and outputs in new pdf
    """
    # `pages` holds 1-based page numbers to rotate; None rotates every page.
    # `rotate` is one of 'left', 'right' or 'inverted' (KeyError otherwise).
    # All pages are written to `output`, rotated or not.
    source = PdfFileReader(input)
    angle = {'left': -90, 'right': 90, 'inverted': 180}[rotate]
    page_count = source.getNumPages()

    # Page numbers are 1-based, so the range must end at page_count + 1;
    # the previous upper bound silently dropped the last page.
    if pages is None:
        pages = range(1, page_count + 1)

    output_pdf = PdfFileWriter()
    for page_num in range(1, page_count + 1):
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)
        page = source.getPage(page_num - 1)
        if page_num in pages:
            # Public API; negative angles rotate counter-clockwise.
            page.rotateClockwise(angle)
        output_pdf.addPage(page)

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
async def add_watermark(file_path,stage,fileno):
    """Add a watermark to the PDF at *file_path*, then encrypt and save it.

    Encrypted inputs are skipped: the coroutine returns without doing
    anything. Otherwise a watermark is created for the size of the first
    page, merged into the document by ``merge`` and saved by ``savepdf``;
    both of those run in the default executor. The original document
    information is copied to the output.

    Args:
        file_path: PDF to read; also passed to ``savepdf`` as the target.
        stage: Passed through to ``create_watermark``.
        fileno: Passed through to ``create_watermark``.
    """
    source_pdf = PdfFileReader(file_path)
    if source_pdf.isEncrypted:
        return

    original_info = source_pdf.getDocumentInfo()

    # Upper-right corner of the first page's media box, scaled by 0.3528
    # (per the original comment: page size converted to millimetres).
    width, height = source_pdf.getPage(0).mediaBox[2:]
    page_size = (int(width)*0.3528, int(height)*0.3528)

    # Build the watermark file (runs synchronously).
    mark = create_watermark(page_size, stage, fileno)

    loop = asyncio.get_event_loop()
    watermarked = await loop.run_in_executor(None, merge, source_pdf, mark)

    # Set passwords. NOTE(review): the owner password is hard-coded.
    watermarked.encrypt(user_pwd='', owner_pwd='12345',use_128bit=True)
    watermarked.addMetadata(original_info)

    await loop.run_in_executor(None, savepdf, watermarked, file_path)
def print_meta(filename):
    """Print every document-information entry of a PDF file.

    Args:
        filename: Path of the PDF to inspect.

    A header line is always printed; PDFs without any metadata produce no
    further output.
    """
    with open(filename, 'rb') as pdf:
        doc_info = PdfFileReader(pdf).getDocumentInfo()
        print(f'[*] PDF MetaData For: {str(filename)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; iterating None would raise TypeError.
        for meta_item in doc_info or ():
            print(f'[+] {meta_item}: {doc_info[meta_item]}')
def extract(input, output, pages, verbosity, **kwargs):
    """extract one or multiple pages and build a new document.

    pdfcli extract source.pdf -o clear.pdf -p 1,3-5
    """
    # `pages` holds 1-based page numbers; None selects every page.
    source = PdfFileReader(input)
    if pages is None:
        # 1-based numbering: the range must end at numPages + 1, otherwise
        # the last page is left out of the default selection.
        pages = range(1, source.numPages + 1)

    output_pdf = PdfFileWriter()
    for page_num in pages:
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)
        output_pdf.addPage(source.getPage(page_num - 1))

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
def splitPdf(path='./input.pdf', N=5):
    """Split a PDF after its N-th page into two files.

    Pages 1..N go to ``<stem>p1_<N>.pdf`` and pages N+1..end go to
    ``<stem>p<N+1>_<numPages>.pdf``, where ``<stem>`` is *path* without its
    extension. Nothing is written when *path* is not a file or when the
    document has N pages or fewer.

    Args:
        path: Path of the PDF to split.
        N: Number of pages that go into the first file.
    """
    if not os.path.isfile(path):
        return

    pdfFileReader = PdfFileReader(path)
    numPages = pdfFileReader.getNumPages()  # total number of pages
    print(numPages)

    fname = os.path.splitext(path)[0]
    outFile1 = '{}p1_{}.pdf'.format(fname, N)
    outFile2 = '{}p{}_{}.pdf'.format(fname, N + 1, numPages)
    print(outFile1, outFile2)

    if numPages > N:
        # Zero-based indexes 0..N-1 are pages 1..N; N..numPages-1 are the
        # rest. (The previous ranges skipped page N altogether.)
        parts = (
            (range(N), outFile1),
            (range(N, numPages), outFile2),
        )
        for page_indexes, out_path in parts:
            # A fresh writer per part, so that the second file does not
            # also contain the pages of the first one.
            pdfFileWriter = PdfFileWriter()
            for index in page_indexes:
                pdfFileWriter.addPage(pdfFileReader.getPage(index))
            # Close the output handle deterministically.
            with open(out_path, 'wb') as out_file:
                pdfFileWriter.write(out_file)
def make_booklet(input_name, output_name, blanks=0):
    """Impose the pages of a PDF as a booklet of double-width pages.

    Args:
        input_name: Path of the source PDF.
        output_name: Path of the booklet PDF to write.
        blanks: Number of empty placeholders (``None``) inserted before the
            first page.

    Each output page is twice as wide as the first input page. All sheet
    backs are written first, then all sheet fronts; afterwards
    ``print_instructions`` is called with the sheets.
    """
    # The input must stay open until the output is written: the writer
    # reads page data from the reader at write time.
    with open(input_name, "rb") as input_file:
        reader = PdfFileReader(input_file)
        pages = [None] * blanks
        pages.extend(reader.getPage(p) for p in range(reader.getNumPages()))
        sheets = build_booklet(pages)

        first_page = reader.getPage(0)
        input_width = first_page.mediaBox.getWidth()
        input_height = first_page.mediaBox.getHeight()
        page_size = (input_width * 2, input_height)

        writer = PdfFileWriter()
        # We want to group fronts and backs together.
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.back)
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.front)

        # Close (and flush) the output before reporting to the user.
        with open(output_name, "wb") as output_file:
            writer.write(output_file)
    print_instructions(sheets)
def join(ctx, inputs, output, verbosity, **kwargs):
    """join multiple pdf together in a single file.

    pdfcli join files*.pdf -o joined.pdf
    """
    # Exits with status 1 when no inputs are given or an input is missing.
    if not inputs:
        click.echo("No input files")
        ctx.exit(1)

    # Validate every input before any work is done.
    for candidate in inputs:
        if Path(candidate).exists():
            continue
        if verbosity >= 1:
            click.echo("File not found '%s'" % candidate, err=True)
        ctx.exit(1)

    merged = PdfFileWriter()
    for input_path in inputs:
        reader = PdfFileReader(input_path)
        if verbosity >= 1:
            click.echo("Adding %s" % input_path)
        for page_index in range(0, reader.numPages):
            merged.addPage(reader.getPage(page_index))

    merged.write(output)
    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
def generate_pdf(template_name, request, contexte, extra_pdf_files=None):
    """Render a template to PDF and return it as an HTTP response.

    Args:
        template_name: Name of the template to render.
        request: Current request; ``request.user`` is added to the context.
        contexte: Template context. It is updated in place with
            ``MEDIA_ROOT``, ``cdate`` and ``user``.
        extra_pdf_files: Optional iterable of PDFs appended after the
            rendered document.

    Returns:
        An ``application/pdf`` response on success; the rendered
        ``pdf_error.html`` page when an extra PDF cannot be appended; or a
        plain response containing the escaped HTML when PDF creation fails.
    """
    # Imported locally: ``cgi.escape`` was removed in Python 3.8 and
    # ``html.escape(..., quote=False)`` escapes the same characters.
    from html import escape as escape_html

    template = get_template(template_name)
    contexte.update({
        'MEDIA_ROOT': settings.MEDIA_ROOT,
        'cdate': now(),
        'user': request.user
    })
    rendered_html = template.render(contexte)

    try:
        result = BytesIO()
        pisa_status = pisa.CreatePDF(rendered_html, result)
        if not pisa_status.err:
            if extra_pdf_files:
                from PyPDF4 import PdfFileWriter, PdfFileReader
                output = PdfFileWriter()
                append_pdf(PdfFileReader(result), output)
                # Fresh buffer for the combined document.
                result = BytesIO()
                for pdf_file in extra_pdf_files:
                    try:
                        append_pdf(PdfFileReader(pdf_file), output)
                    except Exception:
                        return render(request, "pdf_error.html", {
                            'pdf': pdf_file,
                            'error': traceback.format_exc()
                        })
                output.write(result)
            return http.HttpResponse(result.getvalue(),
                                     content_type='application/pdf')
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.exception(e)

    # Reached when pisa reported an error or an exception was logged.
    return http.HttpResponse(
        'Gremlins ate your pdf! %s' % escape_html(rendered_html, quote=False))
def remove_watermark(wm_text, inputFile, outputFile):
    """Write a copy of a PDF with its watermark text blanked out.

    Args:
        wm_text: Text operands of ``Tj`` operators that start with this
            string are replaced by an empty string.
        inputFile: Path of the PDF to read.
        outputFile: Path of the cleaned PDF to write.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    with open(inputFile, "rb") as f:
        # The second argument of PdfFileReader is ``strict``, not a file
        # mode. The former value "rb" was merely truthy, so strict=True
        # keeps the effective behaviour while saying what is meant.
        source = PdfFileReader(f, strict=True)
        output = PdfFileWriter()

        for page_index in range(source.getNumPages()):
            page = source.getPage(page_index)
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, source)

            for operands, operator in content.operations:
                if operator == b_("Tj"):
                    text = operands[0]
                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

            page[NameObject('/Contents')] = content
            output.addPage(page)

        # Written while the input is still open, since the pages reference
        # the reader.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
def get_info(input_file: str):
    """Extract and print basic information about a PDF file.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        ``(True, output)`` where ``output`` maps ``"File"`` and
        ``"Encrypted"`` (the string ``"True"``/``"False"``) and, for
        unencrypted files, ``"Author"``, ``"Creator"``, ``"Producer"``,
        ``"Subject"``, ``"Title"`` and ``"Number of pages"``. The textual
        fields are ``None`` when the PDF has no metadata.
    """
    # If the PDF is encrypted, the file metadata cannot be extracted.
    with open(input_file, 'rb') as pdf_file:
        pdf_reader = PdfFileReader(pdf_file, strict=False)
        encrypted = pdf_reader.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False"
        }
        if not encrypted:
            info = pdf_reader.getDocumentInfo()
            num_pages = pdf_reader.getNumPages()
            fields = (
                ("Author", "author"),
                ("Creator", "creator"),
                ("Producer", "producer"),
                ("Subject", "subject"),
                ("Title", "title"),
            )
            for label, attribute in fields:
                # getDocumentInfo() returns None for PDFs without an /Info
                # dictionary; report empty fields instead of raising.
                output[label] = (
                    getattr(info, attribute) if info is not None else None)
            output["Number of pages"] = num_pages

    # Display the collected metadata.
    print(
        "## File Information ##################################################"
    )
    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
    print(
        "######################################################################"
    )
    return True, output
def rotate_pdf(path, degrees, output):
    """Rotate every page of a PDF clockwise and write the result.

    Args:
        path: Source passed to ``PdfFileReader``.
        degrees: Rotation angle; converted with ``int()`` for each page.
        output: Destination passed to ``PdfFileWriter.write``.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(path)
    for index in range(reader.getNumPages()):
        rotated = reader.getPage(index).rotateClockwise(int(degrees))
        writer.addPage(rotated)
    writer.write(output)
def readWritePdf():
    """Copy the first page of a fixed input PDF into ``./new.pdf``."""
    with open("./Python 面试题.pdf", "rb") as source_file:
        reader = PdfFileReader(source_file)
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(0))
        # Written while the source is still open.
        with open("./new.pdf", "wb") as target_file:
            writer.write(target_file)
def merge(pdf_input, mark):
    """Stamp the first page of a watermark PDF onto every input page.

    Args:
        pdf_input: Reader of the document to watermark; its pages are
            modified in place by ``mergePage``.
        mark: Path of the watermark PDF.

    Returns:
        A ``PdfFileWriter`` holding the watermarked pages. The watermark
        file is left open.
    """
    page_count = pdf_input.getNumPages()
    writer = PdfFileWriter()
    watermark_reader = PdfFileReader(open(mark, 'rb'), strict=False)
    for index in range(page_count):
        current = pdf_input.getPage(index)
        current.mergePage(watermark_reader.getPage(0))
        writer.addPage(current)
    return writer
def on_file_selected(self):
    """Show the page count of the newly selected PDF.

    Does nothing when no file is selected. When the file is encrypted and
    cannot be decrypted, the page count text and the selection are cleared.
    """
    selected_path = self.file_selector.getpath()
    if not selected_path:
        return

    reader = PdfFileReader(selected_path)
    if not reader.isEncrypted or decrypt(reader, MESSAGE_TITLE):
        self.page_count_text.set(reader.getNumPages())
    else:
        self.page_count_text.set("")
        self.file_selector.clear()
def extract(fileobj):
    """Extract the text of every page of a PDF.

    Args:
        fileobj: Source passed to ``PdfFileReader`` (non-strict mode).

    Returns:
        One concatenated string when the enclosing-scope ``fmt`` equals
        ``"string"``, otherwise a list with one text entry per page.
    """
    reader = PdfFileReader(fileobj, strict=False)
    page_texts = [
        reader.getPage(index).extractText()
        for index in range(reader.getNumPages())
    ]
    if fmt == "string":
        return "".join(page_texts)
    return page_texts