def extract_information(pdf_path, page_number=92, output_file="file.mp3"):
    """Convert one page of a PDF to speech and return the document info.

    The text of page ``page_number`` is extracted, printed, and handed to
    gTTS, which saves the synthesized English speech to ``output_file``.

    :param pdf_path: Path of the PDF file to read.
    :param page_number: Zero-based index of the page to convert. Defaults to
        92, the page index this function always used.
    :param output_file: Path of the MP3 file to create.
    :return: The object returned by ``PdfFileReader.getDocumentInfo()``.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        # Extract the page text once and reuse it for printing and speech.
        page_text = pdf.getPage(page_number).extractText().strip()

    print(page_text)

    # The language is passed by keyword so the call does not depend on the
    # positional order of the gTTS constructor arguments.
    tts = gTTS(page_text, lang='en')
    tts.save(output_file)
    return information
def PDFMerge(savePath, pdfPath, watermarkPdfPath):
    """Merge a watermark page onto the centre of every page of a PDF.

    :param savePath: Path the merged PDF is written to.
    :param pdfPath: Path of the PDF to watermark.
    :param watermarkPdfPath: Path of the PDF whose first page is the watermark.
    """
    # The input streams stay open until the writer has finished, and every
    # stream is closed even if merging or writing fails.
    with open(pdfPath, 'rb') as pdfFile, \
            open(watermarkPdfPath, 'rb') as watermarkPdfFile:
        # Load the source PDF and the first page of the watermark PDF.
        pdfReader = PdfFileReader(pdfFile, strict=False)
        watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)
        pdfWriter = PdfFileWriter()

        # Repeat for every page of the source PDF.
        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)

            # Half the difference of the upper-right coordinates, used as the
            # translation that places the watermark in the middle of the page.
            x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
            y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2

            # Merge the watermark page into the current page.
            pageObj.mergeTranslatedPage(page2=watermarkPdf, tx=x, ty=y, expand=False)

            # Queue the merged page for the output PDF.
            pdfWriter.addPage(pageObj)

        # Save the result.
        with open(savePath, 'wb') as resultFile:
            pdfWriter.write(resultFile)
def pdf_mediabox(filename):
    """Build a result line with the first page's media box size.

    The media box height and width are multiplied by ``points_to_mm``
    (presumably a points-to-millimetres factor defined elsewhere in this
    module) and rounded before being passed on.

    :param filename: Path of the PDF file to inspect.
    :return: Whatever ``result_line(filename, height, width)`` returns.
    """
    # Use a context manager so the file handle is closed after reading.
    with open(filename, 'rb') as stream:
        box = PdfFileReader(stream).getPage(0).mediaBox
        width = box.getWidth()
        height = box.getHeight()
    return result_line(
        filename,
        round(float(height) * points_to_mm),
        round(float(width) * points_to_mm),
    )
def test_PdfReaderFileLoad(self):
    '''
    Load 'crazyones.pdf', extract the text of its first page and compare
    it with the reference text stored in 'crazyones.txt'.

    Expected outcome: the file loads and the extracted text (newlines
    removed, UTF-8 encoded) equals the reference bytes.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'crazyones.pdf')
    reference_path = os.path.join(RESOURCE_ROOT, 'crazyones.txt')

    with open(pdf_path, 'rb') as inputfile:
        # Parse the PDF and grab its first page
        first_page = PdfFileReader(inputfile).getPage(0)

        # Read the known-good text as raw bytes
        with open(reference_path, 'rb') as reference_file:
            expected = reference_file.read()

        extracted = first_page.extractText().replace('\n', '').encode('utf-8')

        # Compare the extracted text against the reference
        failure = (
            'PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expected, extracted)
        )
        self.assertEqual(extracted, expected, msg=failure)
def test_PdfReaderJpegImage(self):
    '''
    Load 'jpeg.pdf', pull the data of the '/Im4' image object from its
    first page and compare its hex dump with the text in 'jpeg.txt'.

    Expected outcome: the file loads and the hex-encoded image data
    equals the reference text.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
    reference_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

    with open(pdf_path, 'rb') as inputfile:
        # Parse the PDF
        reader = PdfFileReader(inputfile)

        # Read the expected hex dump of the image
        with open(reference_path, 'r') as reference_file:
            expected_hex = reference_file.read()

        first_page = reader.getPage(0)
        x_objects = first_page['/Resources']['/XObject'].getObject()
        image_bytes = x_objects['/Im4'].getData()
        actual_hex = binascii.hexlify(image_bytes).decode()

        # Compare the extracted image against the reference
        failure = (
            'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (expected_hex, actual_hex)
        )
        self.assertEqual(actual_hex, expected_hex, msg=failure)
def split(file):
    """
    Split the file chosen by the user page by page and save each page in
    the splitter output directory as a new PDF file. Every output file
    corresponds to one page of the original document.

    :param file: The file chosen by the user whose pages are to be separated
    :return: None
    """
    # Clean the directory first to avoid duplicated files/directories
    Splitter.cleanDir()

    # Normalize the file name
    file = Merger.toPath(file)

    # Write each page to its own PDF, named after its 1-based page number
    with open(file, mode='rb') as source:
        total_pages = PdfFileReader(source).numPages
        for index in range(total_pages):
            # A new reader is built for every page, exactly as before.
            # NOTE(review): presumably this avoids sharing one reader across
            # several writers - confirm before hoisting it out of the loop.
            reader = PdfFileReader(source)
            single_page = PdfFileWriter()
            single_page.addPage(reader.getPage(index))
            target = join(Splitter.splitter_dir, f"página_{index + 1}.pdf")
            with open(target, mode='wb') as pdf:
                single_page.write(pdf)
def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata = False, make_backup = True):
    """Rewrite a PDF with updated document-information entries.

    The original file is first renamed to a backup, then every page is
    copied into a new file at ``pdf_file`` together with the metadata.

    :param pdf_file: Path of the PDF to update; rewritten in place.
    :param metadata: Mapping of entry name (without the leading '/') to
        value. Each value is converted with ``str()`` before being stored.
    :param substitute_all_metadata: When false, the existing document
        information is copied first and ``metadata`` is applied on top of it;
        when true, only ``metadata`` is written.
    :param make_backup: A string is used as the backup path; otherwise the
        backup is ``pdf_file`` with its extension replaced by ``.bak``.
        When equal to ``False`` the backup is deleted at the end.
    """
    if isinstance(make_backup, str):
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()
        if not substitute_all_metadata:
            # documentInfo can be None when the PDF has no info dictionary;
            # fall back to an empty mapping instead of failing on iteration.
            info = pdf_in.documentInfo or {}
            for key in info:
                infoDict.update({key: info[key]})
        for key in metadata:
            infoDict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        # Written while the backup is still open: the copied pages are read
        # from it during write().
        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

    # Equality (not identity) is kept on purpose so that values equal to
    # False, such as 0, still disable the backup as they did before.
    if make_backup == False:
        os.unlink(bak_file)
def split_pdf(myfile):
    """Extract sample pages from a PDF in /tmp and upload the results.

    For every complete group of 50 pages, the first two pages of the group
    are appended to the output writer, the writer is saved as
    ``/tmp/document-page<i>.pdf`` and that file is uploaded under the
    ``extracted-pdf/`` prefix of ``destbucketName`` via ``client``.

    :param myfile: Name of the input PDF inside ``/tmp/``.
    """
    # The context manager closes the input file even when a step fails.
    with open('/tmp/' + myfile, 'rb') as pdf_in_file:
        inputpdf = PdfFileReader(pdf_in_file)
        pages_no = inputpdf.numPages
        print(pages_no)

        # NOTE(review): the writer is shared by all iterations, so each output
        # file also contains the pages added for the earlier files. Confirm
        # whether that accumulation is intended.
        output = PdfFileWriter()
        for i in range(pages_no // 50):
            output.addPage(inputpdf.getPage(i * 50))
            if i * 50 + 1 < pages_no:
                output.addPage(inputpdf.getPage(i * 50 + 1))

            newname = 'document-page%s.pdf' % i
            print('/tmp/' + newname)
            print(newname)
            with open('/tmp/' + newname, "wb") as outputStream:
                output.write(outputStream)
            client.upload_file('/tmp/' + newname, destbucketName, 'extracted-pdf/' + newname)
def pypdf3():
    """Watermark every page of ``document`` and return ``output_filename``.

    The first page of ``watermark`` is merged onto each page of the
    document. When ``self.underneath`` is set, the watermark is placed on a
    blank page first and the document page is merged on top of it.
    """
    # 5b. Get our files ready
    document_reader = PdfFileReader(document)
    output_file = PdfFileWriter()
    page_count = document_reader.getNumPages()

    # Watermark page plus the translation/rotation applied when rotating
    watermark_reader = PdfFileReader(watermark)
    wtrmrk_page = watermark_reader.getPage(0)
    wtrmrk_width = (wtrmrk_page.mediaBox.getWidth() / 2) + 0
    wtrmrk_height = (wtrmrk_page.mediaBox.getHeight() / 2) + 80
    if Info(watermark_reader).rotate is not None:
        wtrmrk_rotate = -int(Info(watermark_reader).rotate)
    else:
        wtrmrk_rotate = 0

    # 5c. Add the watermark to every page of the input file
    for page_number in range(page_count):
        place_underneath = self.underneath

        # Choose the page that receives the watermark
        if place_underneath:
            size = Info(document_reader).dimensions
            input_page = PageObject().createBlankPage(document_reader, size['w'], size['h'])
        else:
            input_page = document_reader.getPage(page_number)

        # Stamp the watermark; without rotation it is placed at the origin
        if wtrmrk_rotate != 0:
            input_page.mergeRotatedTranslatedPage(wtrmrk_page, wtrmrk_rotate, wtrmrk_width, wtrmrk_height)
        else:
            wtrmrk_width = 0
            wtrmrk_height = 0
            input_page.mergeTranslatedPage(wtrmrk_page, wtrmrk_width, wtrmrk_height)

        # Underneath mode: the document page goes on top of the watermark
        if place_underneath:
            input_page.mergePage(document_reader.getPage(page_number))

        output_file.addPage(input_page)

    # 5d. Finally, write the output document to disk
    with open(output_filename, "wb") as outputStream:
        output_file.write(outputStream)
    return output_filename
def overlay_pdfs(top_pdf, bottom_pdf, destination):
    """
    Merge the first page of one PDF on top of the first page of another
    and save the single resulting page.

    :param top_pdf: PDF whose first page is placed on top
    :param bottom_pdf: PDF whose first page is placed underneath
    :param destination: Destination path of the output file
    """
    overlay_reader = PdfFileReader(top_pdf)
    base_reader = PdfFileReader(bottom_pdf)

    # Stamp the overlay onto the existing first page
    base_page = base_reader.getPage(0)
    base_page.mergePage(overlay_reader.getPage(0))

    # Collect the merged page in a new document and write it to disk
    writer = PdfFileWriter()
    writer.addPage(base_page)
    with open(destination, "wb") as sink:
        writer.write(sink)
def main():
    """Create a 2-up PDF: place consecutive page pairs side by side.

    Usage: ``python 2-up.py input_file output_file``. Pages are taken two at
    a time; the second page of each pair is merged to the right of the
    first one. When the input has an odd number of pages, the last page is
    appended on its own instead of being dropped.
    """
    if len(sys.argv) != 3:
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)

    print("2-up input " + sys.argv[1])
    # Both streams are managed by `with` so they are flushed and closed; the
    # input must remain open until the output has been written.
    with open(sys.argv[1], "rb") as inputStream:
        input1 = PdfFileReader(inputStream)
        output = PdfFileWriter()
        page_count = input1.getNumPages()

        for index in range(0, page_count - 1, 2):
            lhs = input1.getPage(index)
            rhs = input1.getPage(index + 1)
            # Shift the right-hand page by the width of the left-hand page
            # and let the merged page expand to hold both.
            lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
            output.addPage(lhs)
            # Progress indicator on a single line.
            sys.stdout.write(str(index) + " ")
            sys.stdout.flush()

        # An unpaired trailing page is kept rather than silently lost.
        if page_count % 2:
            output.addPage(input1.getPage(page_count - 1))

        print("writing " + sys.argv[2])
        # `open` replaces the Python 2-only `file` builtin.
        with open(sys.argv[2], "wb") as outputStream:
            output.write(outputStream)
    print("done.")
def start_Encryption(self):
    """Encrypt the selected PDF with the passwords typed in the form.

    When a file and both passwords are present, every page is copied into a
    new encrypted PDF saved next to the source with '.pdf' replaced by
    '_encrypted.pdf'; a confirmation is shown and the form is reset.
    Otherwise an error box is shown for each empty password field.
    Unexpected failures are reported in an error box instead of being
    silently ignored.
    """
    def notify(title, icon, text):
        # Show a modal message box with the given title, icon and text.
        msg = QMessageBox()
        msg.setWindowTitle(title)
        msg.setIcon(icon)
        msg.setText(text)
        msg.exec_()

    try:
        # No file selected (for example right after a reset): nothing to do.
        # This case used to be hidden by the bare except.
        if not self.filename:
            return

        if self.filename[0] and self.userPassword.text() and self.ownerPassword.text():
            source = self.filename[0]
            pfw = PdfFileWriter()
            pdffile = PdfFileReader(source)
            for page in range(pdffile.numPages):
                pfw.addPage(pdffile.getPage(page))
            pfw.encrypt(self.userPassword.text(), self.ownerPassword.text())

            # The context manager closes the output even if writing fails.
            with open(source.replace('.pdf', '_encrypted.pdf'), 'wb') as file:
                pfw.write(file)

            notify('Done', QMessageBox.Information, 'File encryption done successfully.')

            # Reset the form for the next file.
            self.filename = ''
            self.userPassword.setText('')
            self.ownerPassword.setText('')
            self.userPassword.setDisabled(True)
            self.ownerPassword.setDisabled(True)
            self.startEncryption.setDisabled(True)
        else:
            if self.ownerPassword.text() == '':
                notify('Error', QMessageBox.Critical, 'Owner Password Field is Empty.')
            if self.userPassword.text() == '':
                notify('Error', QMessageBox.Critical, 'User Password Field is Empty.')
    except Exception as exc:
        # Still never propagates out of the handler, but the user is told
        # that the encryption did not happen.
        notify('Error', QMessageBox.Critical, 'File encryption failed: %s' % exc)
def pdf_metadata_load(pdf_file):
    """Return the document information of a PDF file.

    :param pdf_file: Path of the PDF to read.
    :return: The reader's ``documentInfo`` value. NOTE(review): this is
        expected to be None when the PDF carries no info dictionary -
        confirm against the PyPDF documentation.
    """
    # Only the info dictionary is needed. The earlier page-by-page copy into
    # a writer was never used, so it has been removed.
    with open(pdf_file, 'rb') as fin:
        return PdfFileReader(fin).documentInfo
def pypdf3(self):
    """Rotate every page clockwise by ``self.rotation`` using PyPDF3.

    Reads ``self.file_name``, writes the rotated pages to ``self.outfn``
    and returns that output path.
    """
    with open(self.file_name, 'rb') as source:
        writer = PdfFileWriter()
        reader = PdfFileReader(source)

        for index in range(reader.numPages):
            rotated = reader.getPage(index)
            rotated.rotateClockwise(self.rotation)
            writer.addPage(rotated)

        # Write while the source is still open.
        with open(self.outfn, 'wb') as sink:
            writer.write(sink)
    return self.outfn
def merge_pdfs(paths, output):
    """Concatenate the pages of several PDFs into one output file.

    :param paths: Iterable of input PDFs, merged in iteration order.
    :param output: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()

    for source in paths:
        reader = PdfFileReader(source)
        total = reader.getNumPages()
        index = 0
        # Append every page of this input, in order
        while index < total:
            merged.addPage(reader.getPage(index))
            index += 1

    # Save the combined document
    with open(output, 'wb') as sink:
        merged.write(sink)
def pypdf3(self):
    """Scale every page onto a blank page of the target size using PyPDF3.

    Each page of ``self.file_name`` is merged, scaled by ``self.scale`` and
    translated by ``self.margin_x``/``self.margin_y``, onto a new blank page
    of ``self.target_w`` x ``self.target_h``. The result is written to
    ``self.output``, which is also returned.
    """
    source = PdfFileReader(self.file_name)
    scaled = PdfFileWriter()

    for index in range(source.getNumPages()):
        original_page = source.getPage(index)
        # Fresh canvas with the target dimensions for each page
        canvas = PageObject.createBlankPage(width=self.target_w, height=self.target_h)
        canvas.mergeScaledTranslatedPage(original_page, self.scale, self.margin_x, self.margin_y)
        scaled.addPage(canvas)

    with open(self.output, "wb") as sink:
        scaled.write(sink)
    return self.output
def prepare(self):
    """Convert the input PDF to a single-line text file and transform it.

    The text of every page of ``self.inputPath`` is appended to a text file
    under ``./.TXT``; the file is then rewritten with all line breaks
    removed and handed to ``self.transform``.
    """
    # The target name does not depend on the page, so compute it once. This
    # also keeps it defined when the PDF has no pages.
    # NOTE(review): hash() of a str can differ between interpreter runs, so
    # the file name is not stable across runs - confirm this is acceptable.
    filename = join_paths("./.TXT", hash(self.inputPath))

    # Process the PDF input file into a raw text file. The text file is
    # opened once, in append mode, instead of once per page.
    with open(self.inputPath, "rb") as pdf_fh, open(filename, "a") as txt_fh:
        reader = PdfFileReader(pdf_fh)
        for page in tqdm(range(0, reader.numPages)):
            page_text = reader.getPage(page).extractText()
            print("Reading page", page, "of", reader.getNumPages())
            txt_fh.write(page_text)

    # Clean the text file for better processing: drop every line break.
    with open(filename, "r") as txt_fh:
        lines = [line.replace("\n", "").replace("\r", "") for line in txt_fh]
    with open(filename, "w") as txt_fh:
        txt_fh.writelines(lines)

    print("Cleaning... => ", filename)
    self.transform(filename)
def reorder(input_filename: str, output_filename: str) -> None:
    """Write a copy of a PDF with its pages in a new order.

    The page order comes from ``_make_sequence(page_count)``.

    :param input_filename: Path of an existing PDF to read.
    :param output_filename: Path of the PDF to create; must not exist yet.
    :raises AssertionError: If the input is missing or the output exists.
    """
    assert os.path.exists(input_filename)
    assert os.path.exists(output_filename) is False

    # Context managers close both streams even if reading or writing fails.
    with open(input_filename, 'rb') as input_stream:
        output = PdfFileWriter()
        input_pdf = PdfFileReader(input_stream)
        order = _make_sequence(input_pdf.getNumPages())
        for page_number in order:
            output.addPage(input_pdf.getPage(page_number))

        # The input stays open here because pages are read during write().
        with open(output_filename, "wb") as output_stream:
            output.write(output_stream)
def write_pdf(pdf_obj, destination):
    """
    Copy every page of a PDF object into a new file on disk.

    :param pdf_obj: PDF object (anything PdfFileReader accepts) to be written
    :param destination: Destination path of the output file
    """
    source = PdfFileReader(pdf_obj)
    copier = PdfFileWriter()
    total = source.getNumPages()

    # Transfer the pages one by one, keeping their order
    for index in range(total):
        copier.addPage(source.getPage(index))

    # Finally, write the collected pages to a real file
    with open(destination, "wb") as sink:
        copier.write(sink)
def add_encryption(path, encryptPath, fileDicts):
    """Write a password-protected copy of each listed PDF.

    :param path: Directory containing the input PDFs.
    :param encryptPath: Directory the encrypted copies are written to.
    :param fileDicts: Mapping of file name to the user password for that
        file.
    """
    for fileName in fileDicts:
        input_pdf = os.path.join(path, fileName)
        output_pdf = os.path.join(encryptPath, fileName)

        # A new writer per file: a shared writer would carry the pages of
        # earlier files into every later output and be encrypted repeatedly.
        pdf_writer = PdfFileWriter()
        pdf_reader = PdfFileReader(input_pdf)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))
        pdf_writer.encrypt(user_pwd=fileDicts[fileName], owner_pwd=None, use_128bit=True)

        # Delete the output file first if it already exists
        if os.path.exists(output_pdf):
            os.remove(output_pdf)
        with open(output_pdf, 'wb') as fh:
            pdf_writer.write(fh)
def slicer(document, first_page=None, last_page=None, suffix='sliced', tempdir=None):
    """Slice a PDF document to remove pages.

    :param document: Path of the PDF to slice.
    :param first_page: 1-based number of the first page to keep; ``None``
        starts at the first page.
    :param last_page: 1-based number of the last page to keep; ``None``
        keeps everything up to the last page.
    :param suffix: Suffix for the output name when ``tempdir`` is not given.
    :param tempdir: Directory in which to create a temporary output file.
    :return: Path of the sliced PDF.
    :raises AssertionError: If the page range does not fit the document.
    """
    # Set output file name
    if tempdir:
        with NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False) as temp:
            output = temp.name
    elif suffix:
        output = os.path.join(os.path.dirname(document), add_suffix(document, suffix))
    else:
        with NamedTemporaryFile(suffix='.pdf') as temp:
            output = temp.name

    pages = Info(document).pages

    # Reindex page selections for simple user input. The old expression
    # `first_page - 1 if not None else None` always subtracted, so the
    # default None raised a TypeError; None now selects the document bounds.
    first_page = 0 if first_page is None else first_page - 1
    if last_page is None:
        last_page = pages

    # Validate page range by comparing selection to number of pages in PDF document
    invalid = 'Number of pages: ' + str(
        pages) + ' ----> Page Range Input: ' + str(first_page) + '-' + str(
            last_page)
    assert first_page <= last_page <= pages, invalid

    pdf = PdfFileReader(document)
    writer = PdfFileWriter()

    for page in range(pdf.getNumPages())[first_page:last_page]:
        writer.addPage(pdf.getPage(page))

    with open(output, 'wb') as out:
        writer.write(out)
    return output
def rename(pdf,doi):
    """Store a DOI in the document information of a PDF, in place.

    Every page and the existing document information are copied to a
    temporary file next to the original, a '/doi' entry is added, and the
    temporary file then replaces the original.

    :param pdf: Path of the PDF to update.
    :param doi: DOI string to store under the '/doi' key.
    """
    import os

    temppdf = pdf + '.temppdf'

    # Context managers close both streams even if copying or writing fails.
    with open(pdf, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()
        # documentInfo can be None when the PDF has no info dictionary.
        info = pdf_in.documentInfo or {}
        for key in info:
            infoDict.update({NameObject(key): createStringObject(info[key])})
            # Show the full key; key[0] only ever printed the leading "/".
            print('{0}:{1}'.format(key, info[key]))

        # add the DOI
        infoDict.update({NameObject('/doi'): createStringObject(u''+doi)})

        # It does not appear possible to alter in place, so write a
        # temporary copy and swap it in afterwards.
        with open(temppdf, 'wb') as fout:
            writer.write(fout)

    os.unlink(pdf)
    os.rename(temppdf, pdf)
    print('The DOI have been updated to:{0}'.format(doi))
def rotate(file_name, rotate, suffix='rotated', tempdir=None):
    """Rotate PDF by increments of 90 degrees.

    The output path is a persistent temporary file in ``tempdir`` when that
    is given, otherwise the input name with ``suffix`` added, otherwise the
    name of a NamedTemporaryFile. Returns the output path.
    """
    # Decide where the rotated document goes
    if tempdir:
        outfn = NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False).name
    elif suffix:
        folder = os.path.dirname(file_name)
        outfn = os.path.join(folder, add_suffix(file_name, suffix))
    else:
        outfn = NamedTemporaryFile(suffix='.pdf').name

    with open(file_name, 'rb') as source:
        writer = PdfFileWriter()
        reader = PdfFileReader(source)

        for index in range(reader.numPages):
            turned = reader.getPage(index)
            turned.rotateClockwise(rotate)
            writer.addPage(turned)

        # Write while the source is still open
        with open(outfn, 'wb') as sink:
            writer.write(sink)
    return outfn
def split(filename: str, chapters: List[Chapter], directory: str) -> None:
    """Write each chapter's page range of a PDF to its own file.

    For every chapter, pages ``chapter.start`` through ``chapter.stop``
    (1-based, inclusive) are saved as ``chapter.name`` inside ``directory``.
    """
    assert os.path.exists(filename)

    for chapter in chapters:
        print(
            f'Writing to {chapter.name} pages {chapter.start} to {chapter.stop}'
        )
        # Ideally the input would be opened once, before this loop. A bug in
        # the library makes that impossible, so it is reopened per chapter.
        with open(filename, 'rb') as source:
            reader = PdfFileReader(source)
            writer = PdfFileWriter()

            first_index = chapter.start - 1
            for index in range(first_index, chapter.stop):
                writer.addPage(reader.getPage(index))

            target = os.path.join(directory, chapter.name)
            with open(target, "wb") as sink:
                writer.write(sink)
                sink.flush()
def upscale(file_name, scale=1.5, margin_x=0, margin_y=0, suffix='scaled', tempdir=None):
    """Upscale a PDF to a large size.

    Every page is merged, scaled by ``scale`` and shifted by
    ``margin_x``/``margin_y``, onto a blank page whose size is the value
    from ``dimensions(file_name)`` multiplied by ``scale``. Returns the
    output path.
    """
    # Decide where the scaled document goes
    if tempdir:
        output = NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False).name
    elif suffix:
        folder = os.path.dirname(file_name)
        output = os.path.join(folder, add_suffix(file_name, suffix))
    else:
        output = NamedTemporaryFile(suffix='.pdf').name

    source = PdfFileReader(file_name)
    scaled = PdfFileWriter()

    # Target canvas size
    size = dimensions(file_name)
    target_w = size['w'] * scale
    target_h = size['h'] * scale

    for index in range(source.getNumPages()):
        original_page = source.getPage(index)
        canvas = PageObject.createBlankPage(width=target_w, height=target_h)
        canvas.mergeScaledTranslatedPage(original_page, scale, margin_x, margin_y)
        scaled.addPage(canvas)

    with open(output, "wb") as sink:
        scaled.write(sink)
    return output
def Extract_Code_From_PDF(ip_pdf_file, op_pdf_file, code_type):
    """Crop the first page of a PDF to the region holding a code.

    :param ip_pdf_file: Path of the input PDF.
    :param op_pdf_file: Path of the single-page output PDF.
    :param code_type: Text containing 'bar' and/or 'matrix' (any case) to
        pick the crop region. When both match, the 'matrix' region is
        applied last and wins; when neither matches, the page is written
        uncropped.
    """
    # Context managers close both files even if cropping or writing fails.
    with open(ip_pdf_file, "rb") as inputStream:
        output = PdfFileWriter()
        input1 = PdfFileReader(inputStream)
        output_page = input1.getPage(0)
        kind = code_type.lower()

        # `upperLeft` fixes the earlier `upperleft` spelling, which only set
        # an unrelated attribute and did not touch the crop box.
        # bar code
        if 'bar' in kind:
            output_page.cropBox.lowerLeft = (0, 0)
            output_page.cropBox.upperLeft = (0, 100)
            output_page.cropBox.lowerRight = (286, 0)
            output_page.cropBox.upperRight = (286, 100)

        # Data Matrix code
        if 'matrix' in kind:
            output_page.cropBox.lowerLeft = (200, 309)
            output_page.cropBox.upperLeft = (200, 378)
            output_page.cropBox.lowerRight = (270, 309)
            output_page.cropBox.upperRight = (270, 378)

        output.addPage(output_page)
        with open(op_pdf_file, "wb") as outputStream:
            output.write(outputStream)
def compile_journal(directory, pad_path=None, folio_size=8, starting_page_num=1):
    """Assemble the numbered PDFs of a directory into 'out.pdf'.

    Files whose name contains '.pdf' and starts with two digits are sorted
    by name; the first page of each is grouped into folios of
    ``folio_size`` pages, each folio is passed through ``build_folio`` and
    the results are written, in order, to 'out.pdf' in the current working
    directory.

    :param directory: Directory holding the source PDFs.
    :param pad_path: Currently unused. NOTE(review): ``build_folio`` still
        receives None as its second argument - confirm whether this value
        was meant to be forwarded.
    :param folio_size: Number of pages per folio (previously hard-coded
        to 8, which is still the default).
    :param starting_page_num: Page number given to the first folio.
    :raises ValueError: If ``folio_size`` is smaller than 1.
    """
    if folio_size < 1:
        raise ValueError("folio_size must be at least 1")

    names = sorted(f for f in os.listdir(directory) if '.pdf' in f and f[0:2].isdigit())

    # listdir() returns bare names, so join them with the directory; the
    # files are then found regardless of the current working directory.
    pages = [PdfFileReader(os.path.join(directory, name)).getPage(0) for name in names]

    # Chunk the pages into folios; the last folio may be shorter.
    folios = [pages[start:start + folio_size] for start in range(0, len(pages), folio_size)]

    # Offset by the nominal folio size so that a short final folio still
    # gets the correct starting page number.
    joined_folios = [
        build_folio(folio, None, i * folio_size + starting_page_num)
        for i, folio in enumerate(folios)
    ]

    index = PdfFileWriter()
    for folio in joined_folios:
        for page in folio:
            index.addPage(page)

    # Close (and thereby flush) the output file once writing is done.
    with open('out.pdf', 'wb') as out:
        index.write(out)
def pdfMerge(self, savePath, pdfPath, watermarkPdfPath):
    """Merge a watermark page onto the centre of every page of a PDF.

    :param savePath: Path the merged PDF is written to.
    :param pdfPath: Path of the PDF to watermark.
    :param watermarkPdfPath: Path of the PDF whose first page is the watermark.
    """
    # The input streams stay open until the writer has finished, and every
    # stream is closed even if merging or writing fails.
    with open(pdfPath, 'rb') as pdfFile, \
            open(watermarkPdfPath, 'rb') as watermarkPdfFile:
        pdfReader = PdfFileReader(pdfFile, strict=False)
        watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)
        pdfWriter = PdfFileWriter()

        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)
            # Half the difference of the upper-right coordinates, used as the
            # translation that places the watermark in the middle of the page.
            x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
            y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2
            pageObj.mergeTranslatedPage(page2=watermarkPdf, tx=x, ty=y, expand=False)
            pdfWriter.addPage(pageObj)

        with open(savePath, 'wb') as resultFile:
            pdfWriter.write(resultFile)
def extract_text(pdf: PdfFileReader, page: int) -> str:
    """Return the text of page ``page`` with every newline removed."""
    raw_text = pdf.getPage(page).extractText()
    # The extracted text comes back with many superfluous newlines.
    return raw_text.replace('\n', '')
from PyPDF3 import PdfFileWriter, PdfFileReader

# Example: build a new PDF from selected, transformed pages of document1.pdf.
# The input streams are deliberately left open: the pages added to `output`
# are still needed when the writer serializes them later in the script.
output = PdfFileWriter()
input1 = PdfFileReader(open("document1.pdf", "rb"))

# print how many pages input1 has (call form works on Python 2 and 3):
print("document1.pdf has %d pages." % input1.getNumPages())

# add page 1 from input1 to output document, unchanged
output.addPage(input1.getPage(0))

# add page 2 from input1, but rotated clockwise 90 degrees
output.addPage(input1.getPage(1).rotateClockwise(90))

# add page 3 from input1, rotated the other way:
output.addPage(input1.getPage(2).rotateCounterClockwise(90))
# alt: output.addPage(input1.getPage(2).rotateClockwise(270))

# add page 4 from input1, but first add a watermark from another PDF:
page4 = input1.getPage(3)
watermark = PdfFileReader(open("watermark.pdf", "rb"))
page4.mergePage(watermark.getPage(0))
output.addPage(page4)

# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(4)
page5.mediaBox.upperRight = (
    page5.mediaBox.getUpperRight_x() / 2,
    page5.mediaBox.getUpperRight_y() / 2,
)
output.addPage(page5)

# add some Javascript to launch the print window on opening this PDF.