def merge(self, pdf_one, pdf_two, filename='my.pdf', output_dir='D:/pdf/'):
    """Interleave a front-side scan and a back-side scan into a single PDF.

    ``pdf_one`` is the scan of the front sides and ``pdf_two`` the scan of the
    back sides. The front pages are taken first-to-last while the back pages
    are taken last-to-first, so the output is
    front[0], back[N-1], front[1], back[N-2], ...

    :param pdf_one: path of the PDF holding the front sides
    :param pdf_two: path of the PDF holding the back sides
    :param filename: name of the merged file
    :param output_dir: prefix prepended verbatim to ``filename`` (no path
        separator is inserted, so it should end with one)
    :return: None
    """
    # Context managers guarantee the handles are released even when reading or
    # writing raises. The inputs stay open until the output has been written.
    with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
        pdf_input_one = PdfFileReader(input_one)
        pdf_input_two = PdfFileReader(input_two)
        numOne = pdf_input_one.getNumPages()
        numTwo = pdf_input_two.getNumPages()
        print(numOne, numTwo)

        pdf_output = PdfFileWriter()
        for index_one in range(numOne):
            # The back scan is walked backwards.
            # NOTE(review): if the back scan has fewer pages than the front
            # scan this index becomes negative - confirm that case with callers.
            index_two = numTwo - 1 - index_one
            print(index_one, index_two)
            pdf_output.addPage(pdf_input_one.getPage(index_one))
            pdf_output.addPage(pdf_input_two.getPage(index_two))

        pdf_name = output_dir + filename
        with open(pdf_name, 'wb') as output_stream:
            pdf_output.write(output_stream)
    print('Done!')
def PdfPrettyPrint(inputname, outputname):
    """Re-order the pages of a PDF in groups of four and write the result.

    For each group starting at page ``i`` the output receives, in this order,
    page ``i``, page ``i + 2``, page ``i + 1`` rotated by 180 degrees and page
    ``i + 3`` rotated by 180 degrees. A slot whose page lies beyond the end of
    the document is filled with a blank page.

    Args:
        inputname: path of the PDF to read.
        outputname: path of the PDF to write.

    Returns:
        True once the output has been written.
    """
    # (offset inside the group, rotate by 180 degrees?) in output order.
    layout = ((0, False), (2, False), (1, True), (3, True))

    # ``with`` releases both handles even if PyPDF2 raises half-way through.
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        wrt = PdfFileWriter()
        pdfnums = ipt.getNumPages()
        for i in range(0, pdfnums, 4):
            for offset, rotate in layout:
                if i + offset < pdfnums:
                    page = ipt.getPage(i + offset)
                    if rotate:
                        page.rotateClockwise(180)
                    wrt.addPage(page)
                else:
                    wrt.addBlankPage()
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return True
def generate_a_pdf(filename, num_pages, dir=None):
    """Generate a PDF file of N pages with a single image on each page.

    Approach taken from
    https://stackoverflow.com/questions/2925484/place-image-over-pdf

    Args:
        filename (str): name of the PDF file to create
        num_pages (int): number of pages to put into the PDF file

    KWArgs:
        dir (str): directory to save the PDF file in; the current working
            directory is used when it is empty or None

    Returns:
        str. path to the new PDF file
    """
    pdf = PdfFileWriter()
    for num in range(1, num_pages + 1):
        jpeg_path = make_a_jpeg('{}.jpeg'.format(num), pick_a_color(num))
        try:
            # Draw the JPEG onto an in-memory one-page PDF, then lift that page
            # into the output document.
            img_buffer = BytesIO()
            img_doc = canvas.Canvas(img_buffer, pagesize=A4)
            img_doc.drawImage(jpeg_path, 25, 45)
            img_doc.save()
            pdf.addPage(PdfFileReader(BytesIO(img_buffer.getvalue())).getPage(0))
        finally:
            # The temporary image is removed even when drawing fails.
            remove(jpeg_path)

    path = join(dir if dir else getcwd(), filename)
    # Close (and therefore flush) the output deterministically.
    with open(path, 'wb') as output:
        pdf.write(output)
    return path
def generate_images(path, save_dir_name, is_train):
    """Render every page of a PDF to a PNG file.

    The images are written to
    ``png_files/<train_images|test_images>/<save_dir_name>_annotated_images``
    and are named ``<pdf stem>_<1-based page number>.png``.

    Args:
        path: path of the PDF file to convert.
        save_dir_name: prefix of the output directory name.
        is_train: selects ``train_images/`` when truthy, else ``test_images/``.
    """
    subset_dir = 'train_images/' if is_train else 'test_images/'
    save_directory_path = 'png_files/' + subset_dir + save_dir_name + '_annotated_images'
    # Creates 'png_files/' as well, without the check-then-create race.
    os.makedirs(save_directory_path, exist_ok=True)

    filename = path
    print("Converting " + filename + " from pdf to PNG...")
    with open(filename, mode="rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        try:
            page_number = reader.getNumPages()
        except Exception:
            # Retry once; the original code marks this as a work-around for a
            # PyPDF2 bug.
            page_number = reader.getNumPages()

    base_name = os.path.splitext(os.path.basename(filename))[0]
    # The pages are saved while the temporary directory still exists.
    with tempfile.TemporaryDirectory() as work_dir:
        images_from_path = convert_from_path(filename, dpi=72,
                                             output_folder=work_dir,
                                             last_page=page_number,
                                             first_page=0)
        for page_index, page in enumerate(images_from_path, start=1):
            base_filename = base_name + '_' + str(page_index) + '.png'
            page.save(os.path.join(save_directory_path, base_filename), 'PNG')
    print('PDF file successfully converted.')
def test_cat(self):
    """Check that `cat` of the two fixture PDFs produces a 6-page output file."""
    arguments = ['cat', ONEPAGE_PDF, FIVEPAGE_PDF, self.outputfile]
    run_stapler(arguments)

    output_exists = os.path.isfile(self.outputfile)
    self.assertTrue(output_exists)

    with open(self.outputfile, 'rb') as handle:
        page_total = PdfFileReader(handle).getNumPages()
        self.assertEqual(page_total, 6)
def translate(self):
    '''Read the PDF content, translate it and write the result out.

    Every non-blank line produced by ``self.enter_symbol`` is passed to
    ``translate_func``; the source line and its translation (or a fixed
    failure marker when the translation is empty) are handed to
    ``self.write``. A running counter is printed as progress output.
    '''
    index = 0
    # ``with`` closes the PDF even if extraction or translation raises.
    with open(self.fullPath, 'rb') as f:
        pdf = PdfFileReader(f)
        for page_no in range(pdf.getNumPages()):
            extracted_text = pdf.getPage(page_no).extractText()
            content = self.removeBlankFromList(extracted_text.split('\n'))
            # In the joined text, a gap of more than one space between words is
            # treated as a place that needs a line break.
            for line in self.enter_symbol(content):
                line = line.strip()
                if not line:
                    continue
                ret = translate_func(line)
                trans = ret if ret else '翻译失败'
                self.write(line + '\n')
                self.write(trans)
                index += 1
                print(index, end=' ', flush=True)
    Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
def metadata(path: Path) -> Metadata:
    """
    Reads a given PDF file and produces a Metadata object.

    :param path: path to a PDF file
    :return: the metadata extracted from the PDF file
    """
    with path.open('rb') as f:
        reader = PdfFileReader(f)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()
    typer.echo(f'PDF metadata: {info}', err=True)

    # getDocumentInfo() returns None for a PDF without a document information
    # dictionary; treat that as "no title, no author" instead of crashing.
    info_title = info.title if info is not None else None
    info_author = info.author if info is not None else None

    # Decide which possible title to use:
    # - the title annotated in the PDF metadata
    # - the title read by pdftitle (largest text on the first page)
    # - the file name without extension
    pdftitle_title = pdftitle.get_title_from_file(str(path))
    typer.echo(f'Title according to pdftitle: {pdftitle_title}', err=True)
    title_candidates = [t for t in [info_title, pdftitle_title, path.stem]
                        if t is not None]

    # The current heuristic is just to use the longest of the three candidates
    title = max(title_candidates, key=len)

    return Metadata(
        title=title,
        author=info_author,
        page_count=page_count
    )
def finalize_print_preparation(self):
    """Split the resulting multi-page PDF into rotated single-page PDFs.

    Each page is rotated 90 degrees counter-clockwise and written to its own
    file ``<output base filename>_page_<NN>.pdf`` in the output directory.

    Based on
    https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/
    combined with
    https://www.johndcook.com/blog/2015/05/01/rotating-pdf-pages-with-python/
    """
    pdf = PdfFileReader(self._full_output_path_)
    for page_number in range(pdf.getNumPages()):
        rotated_page = pdf.getPage(page_number)
        rotated_page.rotateCounterClockwise(90)

        single_page_writer = PdfFileWriter()
        single_page_writer.addPage(rotated_page)

        output_filename = (f"{self._output_base_filename}_page_"
                           f"{str(page_number + 1).zfill(2)}.pdf")
        target = os.path.join(self._output_directory_name, output_filename)
        with open(target, "wb") as pdf_out:
            single_page_writer.write(pdf_out)

    path_to_pdf = os.path.join(os.getcwd(), self._full_output_path_)
    print(f"Create {pdf.getNumPages()} single paged PDFs.\n\n"
          f"You can find them concatenated at file://"
          f"{path_to_pdf}")
def _create_pdf_from_rtf_files(self):
    """Convert every source file to PDF and merge the results into one file.

    Emits ``self.progress`` with 0 and then with the running count after each
    conversion. For every generated PDF a chapter title and the page count
    are collected into ``self.chapters`` and ``self.pages``. The merged
    document is written to ``self.master_file_name``, or to ``"tmp.pdf"``
    when ``self.create_toc`` is set. The generated PDFs are appended to
    ``self.trash``.
    """
    pdfs = []
    self.progress.emit(0)
    for count, source in enumerate(self.files):
        pdfs.append(change_filetype(source, "pdf", self.engine))
        self.progress.emit(count + 1)

    merger = PdfFileMerger()
    pages = []
    chapters = []
    try:
        for pdf_path in pdfs:
            read_pdf = PdfFileReader(pdf_path)
            page_content = read_pdf.getPage(0).extractText()
            try:
                chapter = helper_functions.get_chapter_from_pdf_txt(
                    page_content)
            except Exception:
                # No chapter found in the first page: fall back to the file
                # name up to its first dot, with underscores as spaces.
                # (``Exception`` rather than a bare except so Ctrl-C and
                # SystemExit are not swallowed.)
                chapter = os.path.basename(pdf_path).split(".")[0]
                chapter = chapter.replace("_", " ")
            chapters.append(chapter)
            pages.append(read_pdf.getNumPages())
            merger.append(fileobj=pdf_path)
        self.pages = pages
        self.chapters = chapters
        merger.write("tmp.pdf" if self.create_toc else self.master_file_name)
    finally:
        # Release the merger's resources even when merging or writing fails.
        merger.close()
    self.trash += pdfs
def test_zip(self):
    """Check that `zip` of the two fixture PDFs produces a 6-page output file."""
    arguments = ['zip', ONEPAGE_PDF, FIVEPAGE_PDF, self.outputfile]
    run_stapler(arguments)

    output_exists = os.path.isfile(self.outputfile)
    self.assertTrue(output_exists)

    with open(self.outputfile, 'rb') as handle:
        page_total = PdfFileReader(handle).getNumPages()
        self.assertEqual(page_total, 6)
def test_sel_range(self):
    """Check that selecting the range A2-4 yields a 3-page output file."""
    source_alias = 'A=' + FIVEPAGE_PDF
    run_stapler(['cat', source_alias, 'A2-4', self.outputfile])

    output_exists = os.path.isfile(self.outputfile)
    self.assertTrue(output_exists)

    with open(self.outputfile, 'rb') as handle:
        page_total = PdfFileReader(handle).getNumPages()
        self.assertEqual(page_total, 3)
def test_del_range(self):
    """Check that deleting the range A2-4 leaves a 2-page output file."""
    source_alias = 'A=' + FIVEPAGE_PDF
    run_stapler(['del', source_alias, 'A2-4', self.outputfile])

    output_exists = os.path.isfile(self.outputfile)
    self.assertTrue(output_exists)

    with open(self.outputfile, 'rb') as handle:
        page_total = PdfFileReader(handle).getNumPages()
        self.assertEqual(page_total, 2)
def extract_from_file(file: IO[bytes], filename: str, mime_type: str,
                      file_id: int) -> Tuple[Optional[str], Optional[int]]:
    """
    Returns the text and the page count.

    For PDFs the text comes from running ``pdftotext`` on ``filename`` and the
    page count from parsing ``file`` with PyPDF2; either part is left as None
    (and logged) when it fails. For ``text/text`` the content of ``file`` is
    returned as text and the page count stays None. Any other mime type is
    logged and yields ``(None, None)``.

    :param file: open binary handle of the document
    :param filename: path of the document on disk, handed to ``pdftotext``
    :param mime_type: mime type of the document
    :param file_id: identifier used in log messages only
    """
    parsed_text = None
    page_count = None

    is_pdf = (mime_type == "application/pdf"
              or mime_type.startswith("application/pdf;"))
    if is_pdf:
        try:
            command = ["pdftotext", filename, "-"]
            completed = subprocess.run(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       check=True)
            parsed_text = completed.stdout.decode("utf-8", "ignore")
            if completed.stderr:
                logger.info("pdftotext: {}".format(completed.stderr))
        except CalledProcessError as e:
            logger.exception("File {}: Failed to run pdftotext: {}".format(
                file_id, e))

        try:
            page_count = PdfFileReader(file,
                                       strict=False,
                                       overwriteWarnings=False).getNumPages()
        except (PdfReadError, KeyError):
            message = "File {}: Pdf does not allow to read the number of pages".format(
                file_id)
            logger.warning(message)
    elif mime_type == "text/text":
        raw = file.read()
        # ``file`` is a binary handle, so decode to honour the ``str`` return
        # type (same codec and error policy as the pdftotext branch). A text
        # handle is passed through unchanged.
        parsed_text = raw.decode("utf-8", "ignore") if isinstance(raw, bytes) else raw
    else:
        logger.warning(
            f"File {file_id} has an unknown mime type: '{mime_type}'")

    return parsed_text, page_count
def extract_pdf_pypdf2(pdf_path, page_index=2):
    """Extract the text of a single page of a PDF with PyPDF2.

    An encrypted document is first decrypted with the empty password.

    Args:
        pdf_path: path of the PDF file.
        page_index: zero-based index of the page to extract. Defaults to 2
            (the third page), which is the page the function always used
            before the index became a parameter.

    Returns:
        The text PyPDF2 extracts from the requested page.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            pdf.decrypt('')
        # Extract while the file is still open.
        return pdf.getPage(page_index).extractText()
def RemovePdfOwnerPassword(inputname, outputname):
    '''Copy a PDF that opens with the empty password to a new file.

    The document is decrypted with the empty user password and every page is
    copied into a fresh writer, which is saved to ``outputname``.

    :param inputname: path of the PDF to read
    :param outputname: path of the PDF to write
    :return: 0 on success, -1 when the input is not an encrypted PDF
    :raises KeyError: any KeyError from ``decrypt`` other than the missing
        ``/Encrypt`` entry is re-raised
    '''
    # ``with`` also closes the input on the early ``return -1`` path.
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # Python 3 exceptions have no ``.message`` attribute; the missing
            # key is the first element of ``args``.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())

        wrt = PdfFileWriter()
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
def run(self):
    """Download the attachment, drop its last page and report the result.

    Nothing happens when ``self.beforeHandler`` returns a truthy value.
    Otherwise the file at ``self.attachUrl`` is downloaded to a temporary
    file, all pages except the last are copied to a second temporary PDF and
    ``self.success`` (if set) is called with its path. Any exception is
    passed to ``self.error`` (if set) instead of propagating. Both temporary
    files are deleted before returning, so the success callback must consume
    the file synchronously.
    """
    if self.beforeHandler(self._id, self.attachUrl):
        return

    filename = self.tempDir + str(random.random())
    filename1 = self.tempDir + str(random.random()) + '.pdf'
    try:
        urllib.request.urlretrieve(self.attachUrl, filename)
        # Both streams are closed by ``with`` even when PyPDF2 raises, so the
        # clean-up below never has to delete a file that is still open.
        with open(filename, 'rb') as input_stream:
            pdf_input = PdfFileReader(input_stream)
            pdf_output = PdfFileWriter()
            # remove last page
            for page in range(pdf_input.getNumPages() - 1):
                pdf_output.addPage(pdf_input.getPage(page))
            with open(filename1, 'wb') as output_stream:
                pdf_output.write(output_stream)
        if self.success is not None:
            self.success(self._id, filename1)
    except Exception as e:
        if self.error is not None:
            self.error(e, self.attachUrl)
    finally:
        for leftover in (filename, filename1):
            if os.path.exists(leftover):
                os.remove(leftover)
def readPDFfile(infile):
    """Return the extracted text of every page of a PDF.

    Args:
        infile: path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, each page followed
        by a newline.
    """
    # The original line had an unbalanced parenthesis; the stray "rb" argument
    # shows that ``open(infile, "rb")`` was intended.
    with open(infile, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
def getDataUsingPyPdf2(filename):
    """Return the extracted text of every page of a PDF.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, each page followed
        by a newline.
    """
    # ``with`` closes the file; the text is extracted while it is still open.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
def _write_bookmark_entries(entries, level, page_labels, bookmark_file):
    """Write one line per bookmark in ``entries``, recursing into nested lists."""
    for entry in entries:
        if isinstance(entry, PyPDF2.generic.Destination):
            bookmark_file.write('\t' * level + entry['/Title'] + '\t' +
                                str(page_labels[entry.page.idnum]) + '\r\n')
        elif isinstance(entry, list):
            # A nested list holds the children of the preceding bookmark.
            _write_bookmark_entries(entry, level + 1, page_labels,
                                    bookmark_file)


def getPdffileBookmark(filename, bookmark_file_savepath):
    """Dump the outline (bookmarks) of a PDF to a UTF-8 text file.

    Each bookmark becomes one line ``<title>\\t<1-based page number>`` that is
    terminated by CRLF and indented with one tab per nesting level.

    Args:
        filename: path of the PDF file to read.
        bookmark_file_savepath: path of the text file to write.
    """
    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)

        # Maps the object number of each page's indirect reference to its
        # 1-based page number; bookmarks refer to pages by that object number.
        pageLabels = {}
        for i in range(pagecount):
            page = pdf.getPage(i)
            pageLabels[page.indirectRef.idnum] = i + 1

        with codecs.open(bookmark_file_savepath, 'w',
                         encoding='utf-8') as bookmark_file:
            outlines = pdf.getOutlines()
            print(outlines)
            for outline in outlines:
                print(len(outline), outline)
                # The helper checks each entry itself. The old innermost loop
                # tested the enclosing list instead of its element, so
                # third-level bookmarks were silently dropped.
                _write_bookmark_entries([outline], 0, pageLabels,
                                        bookmark_file)
def convertPDFAlternative(self, path):
    """Extract the text of every page of a PDF into ``self.pages``.

    Args:
        path: path of the PDF file.

    Returns:
        False when ``path`` does not exist, otherwise True after the text of
        each page has been appended to ``self.pages`` in page order.
    """
    if not os.path.exists(path):
        return False

    # Imported only once there is actually a file to read.
    from PyPDF2.pdf import PdfFileReader

    # ``with`` closes the file even if text extraction raises.
    with open(path, "rb") as stream:
        pdf = PdfFileReader(stream)
        for i in range(pdf.getNumPages()):
            print(i)
            self.pages.append(pdf.getPage(i).extractText())
    return True
def merge_pdf(file_list, output_path):
    '''Merge PDFs.

    The pages of every file in ``file_list`` are appended, in order, to a
    single document that is written to ``output_path``.

    :param file_list: iterable of paths of the PDF files to merge
    :param output_path: path of the merged PDF to write
    '''
    from contextlib import ExitStack

    outpdf = PdfFileWriter()
    # All inputs are kept open until the output has been written (the writer
    # may still need to read page data from them) and are then closed
    # together, also when an error occurs.
    with ExitStack() as stack:
        for f in file_list:
            f_pdf = PdfFileReader(stack.enter_context(open(f, 'rb')))
            for page in f_pdf.pages:
                outpdf.addPage(page)
        with open(output_path, 'wb') as ous:
            outpdf.write(ous)
def test_split(self):
    """Check that `split` leaves five files in the temp dir, one page each."""
    run_stapler(['split', FIVEPAGE_PDF])

    filelist = os.listdir(self.tmpdir)
    self.assertEqual(len(filelist), 5)

    for name in os.listdir(self.tmpdir):
        single_path = os.path.join(self.tmpdir, name)
        with open(single_path, 'rb') as pdf_file:
            page_total = PdfFileReader(pdf_file).getNumPages()
            self.assertEqual(page_total, 1)
def get(self, request, *args, **kwargs):
    """Overlay generated drawings on the first page of a template PDF.

    Registers a CID font, lets ``self.motion_purpose_draw`` draw onto an
    in-memory canvas, merges that canvas page onto page 0 of
    ``media/pdf/riyuu-format4.pdf`` and returns the result as a download
    named ``hello.pdf``.
    """
    fontname_g = "HeiseiMin-W3"
    pdfmetrics.registerFont(UnicodeCIDFont(fontname_g))

    template = PdfFileReader('media/pdf/riyuu-format4.pdf')
    overlay_buffer = io.BytesIO()
    cc = canvas.Canvas(overlay_buffer)
    cc.setFont(fontname_g, 11)

    # (label, before_flag, after_flag) rows handed to the drawing helper.
    rows = (
        ('便器からの立ち座り', True, False),
        ('トイレまでの移動', False, True),
        ('トイレ出入口の出入(扉の開閉含む)', True, False),
    )
    input_list = [
        {'label': label, 'before_flag': before, 'after_flag': after}
        for label, before, after in rows
    ]

    welfare_equipment_material = PdfMaterial.objects.get(
        key="welfare_equipment")
    cc = self.motion_purpose_draw(
        cc,
        748,     # before_rect_x
        776.5,   # after_rect_x
        welfare_equipment_material.materials,
        input_list,
        295,     # initial
        11.9,    # line_height
    )
    cc.showPage()
    cc.save()

    # Re-read the canvas output as a PDF and stamp it onto the template page.
    overlay_buffer.seek(0)
    overlay_page = PdfFileReader(overlay_buffer).getPage(0)
    existing_page = template.getPage(0)
    existing_page.mergePage(overlay_page)

    writer = PdfFileWriter()
    writer.addPage(existing_page)
    new = io.BytesIO()
    writer.write(new)
    new.seek(0)
    print('finish')
    return FileResponse(new, as_attachment=True, filename='hello.pdf')
def getDataUsingPyPdf2(filename):
    """Return the whitespace-normalised ASCII text of a PDF.

    The text of all pages is concatenated, non-breaking spaces are replaced
    by ordinary spaces and every run of whitespace is collapsed to a single
    space.

    Args:
        filename: path of the PDF file to read.

    Returns:
        bytes: the normalised text encoded as ASCII; characters that cannot
        be encoded are dropped.
    """
    # ``with`` closes the file; the text is extracted while it is still open.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        content = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
    content = " ".join(content.replace("\xa0", " ").strip().split())
    return content.encode("ascii", "ignore")
def _extract_pdf_forms(self, fname):
    """Read the interactive form fields of a PDF file.

    Parameters:
        fname (str): Path to PDF file.

    Returns:
        dict: Whatever ``PdfFileReader.getFields()`` reports for the file.
    """
    return PdfFileReader(fname).getFields()
def get(self, request, *args, **kwargs):
    """Stamp a test string onto the first page of the sample PDF.

    Draws the text on an in-memory canvas, merges that canvas page onto
    page 0 of ``media/pdf/sample.pdf`` and returns the result as a download
    named ``hello.pdf``.
    """
    fontname_g = "HeiseiKakuGo-W5"
    pdfmetrics.registerFont(UnicodeCIDFont(fontname_g))

    overlay_buffer = io.BytesIO()
    cc = canvas.Canvas(overlay_buffer)
    existing_page = PdfFileReader('media/pdf/sample.pdf').getPage(0)

    cc.setFont(fontname_g, 24)
    cc.drawString(0, 820, "テスト")
    cc.showPage()
    cc.save()

    # Re-read the canvas output as a PDF and merge it onto the sample page.
    overlay_buffer.seek(0)
    overlay_page = PdfFileReader(overlay_buffer).getPage(0)
    existing_page.mergePage(overlay_page)

    writer = PdfFileWriter()
    writer.addPage(existing_page)
    new = io.BytesIO()
    writer.write(new)
    new.seek(0)
    return FileResponse(new, as_attachment=True, filename='hello.pdf')
def test_render_to_pdf_bytes(self):
    """Render the 'complex_fields' document to PDF and parse the bytes with PyPDF."""
    context = {
        'complex': 'BEEES. AAAAH. BEEEEES',
        'complex2': 'ಠ_ಠ unifying matrix conventions is the way of the future, Max.'
    }
    data = self._get_docx('complex_fields').render(context=context,
                                                   format='pdf')
    # does pypdf like our output?
    PdfFileReader(BytesIO(data))
def pdfSplit(pdf_main, pdf_part):
    """Write the last page of a PDF, rotated 90 degrees clockwise, to a new file.

    Args:
        pdf_main: the PDF to read (passed straight to ``PdfFileReader``).
        pdf_part: path of the single-page PDF to write.

    Returns:
        The page count of ``pdf_main`` minus one on success, or False when
        anything goes wrong. A one-page input therefore returns 0, so callers
        should compare with ``is False`` rather than test truthiness.
    """
    try:
        pdf_read_obj = PdfFileReader(pdf_main)
        page_num = pdf_read_obj.getNumPages()
        page_last_obj = pdf_read_obj.getPage(page_num - 1)
        page_last_obj.rotateClockwise(90)

        pdf_write_obj = PdfFileWriter()
        pdf_write_obj.addPage(page_last_obj)
        # Close (and therefore flush) the output before reporting success.
        with open(pdf_part, 'wb') as output:
            pdf_write_obj.write(output)
        return page_num - 1
    except Exception:
        # Deliberate best-effort: callers get False instead of an exception.
        return False
def process_files(files):
    """Merge PDF files that share a title into the first file seen for it.

    The title of each file comes from ``util.parse_file`` applied to its base
    name. The first file of a title is kept; every later file with the same
    title is appended to it in place and then deleted.

    Args:
        files: iterable of PDF file paths.

    Returns:
        dict mapping each title to the path of the file that now holds all
        pages for that title.
    """
    title_map = {}
    for path in files:
        # parse filename
        title, _, _ = util.parse_file(os.path.basename(path))

        # if not already seen, add this file to the map
        if title not in title_map:
            title_map[title] = path
            continue

        # we need to merge pdfs into 1
        merger = PdfFileMerger()
        try:
            for source in (title_map[title], path):
                with open(source, "rb") as f:
                    merger.append(PdfFileReader(f))
            merger.write(title_map[title])
        finally:
            # Release the merger's resources for every merged pair.
            merger.close()
        os.remove(path)
    return title_map
def set_bglj(self, filename):
    """Record report information for ``filename`` in ``self.user_data``.

    ``'bgms'`` becomes ``'1'`` for a ``.pdf`` file name and ``'0'``
    otherwise. For a PDF, ``'bgym'`` receives its page count, or 0 when the
    count cannot be read (the error is printed). ``'bglj'`` is set to
    ``self.cur_dir``.
    """
    is_pdf = filename.endswith('.pdf')
    self.user_data['bgms'] = '1' if is_pdf else '0'
    if is_pdf:
        # Get the PDF page count
        try:
            self.user_data['bgym'] = PdfFileReader(filename).getNumPages()
        except Exception as e:
            self.user_data['bgym'] = 0
            print("获取报告页码出错,错误信息:%s" % e)
    self.user_data['bglj'] = self.cur_dir