def saveAs(self, fname):
    opdf = PdfWriter()
    # print type(opdf.trailer), type(opdf.trailer.Info), type(opdf.trailer.Info.Author)
    opdf.addpages(self.pdf.pages)
    opdf.trailer.Info = self.pdf.Info
    opdf.trailer.Root.Outlines = self.pdf.Root.Outlines
    opdf.write(fname)
def two_up(data):
    pdf = PdfReader(fdata=data)
    pages = PageMerge() + pdf.pages
    assert len(pages) == 2
    left, right = pages
    rotation = 270
    scale = 0.7071067811865476  # sqrt(0.5)
    x_increment = scale * pages.xobj_box[2]
    left.Rotate = rotation
    left.scale(scale)
    right.Rotate = rotation
    right.scale(scale)
    right.x = x_increment
    writer = PdfWriter()
    writer.addpage(pages.render())
    # retain and update metadata
    pdf.Info.Creator = 'modulo-nic.py %s' % __version__
    writer.trailer.Info = pdf.Info
    sys.stdout.write('Content-Type: application/x-pdf\n\n')
    writer.write(sys.stdout)
def splitting(*varargs, filenameOut="out"):
    if len(varargs) <= 1:
        raise IndexError("Error: provide at least two files.")
    for file in varargs:
        if not isinstance(file, str):
            raise ValueError("Error: the files must be PDFs")
    if not isinstance(filenameOut, str):
        raise ValueError("Error: the output filename must be a str")
    all = PdfWriter()
    # Interleave pages up to the length of the shortest input file.
    numpage = float("inf")
    for file in varargs:
        reader = PdfReader(file)
        numpage = min(numpage, len(reader.pages))
    for i in range(numpage):
        for filename in varargs:
            reader = PdfReader(filename)
            all.addpage(reader.pages[i])
    if not filenameOut.endswith('.pdf'):
        filenameOut = filenameOut + '.pdf'
    all.write(filenameOut)
def splitting(filenameOut="out", *varargs):
    for file in varargs:
        if not isinstance(file, str):
            raise ValueError("Error: the files must be PDFs")
    if not isinstance(filenameOut, str):
        raise ValueError("Error: the output filename must be a str")
    all = PdfWriter()
    numpage = float("inf")
    for file in varargs:
        reader = PdfReader(file)
        numpage = min(numpage, len(reader.pages))
    for i in range(numpage):
        for filename in varargs:
            reader = PdfReader(filename)
            all.addpage(reader.pages[i])
    all.write(filenameOut + ".pdf")
def go(inpfn, outfn):
    reader = PdfReader(inpfn, decompress=False)
    page, = reader.pages
    writer = PdfWriter()
    writer.addpage(adjust(page))
    writer.trailer.Info = IndirectPdfDict(reader.Info)
    writer.write(outfn)
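The `adjust` helper isn't shown in this snippet; here is a minimal sketch of one, modeled on the `adjust` function in the poster example further down this section (the `margin` and `scale` defaults are illustrative assumptions):

from pdfrw import PageMerge

def adjust(page, margin=36, scale=2.0):
    # Trim `margin` points from every edge, then scale the page up.
    box = PageMerge().add(page)
    x1, y1, x2, y2 = box.xobj_box
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    merged = PageMerge().add(page, viewrect=viewrect)
    merged[0].scale(scale)
    return merged.render()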
def combine_match_sheets(match_sheets):
    output_fn = os.path.join(match_sheet_dir, "combined_match_sheets.pdf")
    writer = PdfWriter()
    for match_sheet in match_sheets:
        writer.addpages(PdfReader(match_sheet).pages)
    writer.write(output_fn)
    return output_fn
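`match_sheet_dir` is assumed to be defined at module level; a minimal usage sketch with placeholder directory and file names:

match_sheet_dir = "/tmp/match_sheets"  # assumed module-level setting
sheets = [os.path.join(match_sheet_dir, n) for n in ("team_a.pdf", "team_b.pdf")]
combined = combine_match_sheets(sheets)
print(combined)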
def save_pdf(infile, outpages):
    trailer = PdfReader(infile)
    outfn = create_filename(infile)
    writer = PdfWriter()
    writer.addpages(outpages)
    writer.trailer.Info = trailer.Info
    writer.trailer.Info.Producer = "https://github.com/sgelb/impositioner"
    writer.write(outfn)
def main():
    parser = argparse.ArgumentParser(
        description="Strip ResearchGate additions from a PDF")
    parser.add_argument("infile", metavar="input-filename", type=str, nargs=1,
                        help="PDF file to process")
    parser.add_argument("outfile", metavar="output-filename", type=str, nargs=1,
                        help="name for processed output file")
    args = parser.parse_args()

    # This regular expression matches the form of the ResearchGate
    # underlinings in the content streams. We match against a truncated form
    # of the distinctive RGB triplet because it's not always given with
    # the same accuracy.
    # "0.3333333333 0.6941176471 0.9607843137"
    regex = re.compile(r"""(0\.33333[0-9]+ 0\.694117[0-9]+ 0\.960784[0-9]+ RG
\d+\.?\d* w
\d+\.?\d* \d+\.?\d* m
\d+\.?\d* \d+\.?\d* )l
S""")

    dict_pages = PdfReader(args.infile[0]).pages

    def fix_stream(contents):
        # Look for underlinings and make them invisible.
        if not hasattr(contents, "stream"):
            return
        s = contents.stream
        # We identify RG underlinings by their (hopefully unique)
        # RGB colour triplet.
        if s is not None and regex.search(s):
            # Minimal change: change the line draw commands to
            # moves, so no line is drawn. It would be more
            # satisfying to remove the stream entirely, but it's
            # simpler and safer to preserve the file structure
            # (in particular, the stream length) wherever possible.
            contents.stream = regex.sub("\\1m\nS", s)

    for page in dict_pages:
        if "/Annots" in page:
            # Remove all annotations. This may of course cause some
            # collateral damage, but PDFs of articles don't usually have
            # annotations so probably this will just strip ResearchGate
            # links. If this becomes a problem, it should be easy to
            # identify RG annotations and remove only them.
            page.pop("/Annots")
        # There may be a stream in the Contents object and/or in its
        # children, so we check for both.
        fix_stream(page.Contents)
        for contents in page.Contents:
            fix_stream(contents)

    writer = PdfWriter()
    # Start at the second page to remove the ResearchGate cover sheet.
    for page in dict_pages[1:]:
        writer.addpage(page)
    writer.write(args.outfile[0])
def test_pdf(pdfname):
    outfn = os.path.join(outdir, hashlib.md5(pdfname).hexdigest() + '.pdf')
    print >> stderr, ' ->', outfn
    trailer = PdfReader(pdfname, decompress=False)
    try:
        trailer.Info.OriginalFileName = pdfname
    except AttributeError:
        trailer.OriginalFileName = pdfname
    writer = PdfWriter()
    writer.trailer = trailer
    writer.write(outfn)
def combine(inpfn, outfn, x, y, gap):
    # Read all pages from input file
    pages = PdfReader(inpfn).pages
    # Object to write output PDF
    writer = PdfWriter()
    while pages:
        writer.addpage(getPages(pages, x, y, gap))
    writer.write(outfn)
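The `getPages` helper isn't shown here; a minimal sketch of one, assuming it is meant to pop up to `x * y` pages off the front of the list and tile them onto a single sheet via `PageMerge`, with `gap` points between cells (the exact layout rules are an assumption):

from pdfrw import PageMerge

def getPages(pages, x, y, gap):
    # Pop up to x*y pages off the front of the list and tile them onto one
    # sheet: columns left-to-right, rows top-to-bottom, `gap` points apart.
    chunk = [pages.pop(0) for _ in range(min(x * y, len(pages)))]
    merged = PageMerge() + chunk
    scale = 1.0 / max(x, y)
    width, height = (scale * dim for dim in merged.xobj_box[2:])
    for i, page in enumerate(merged):
        page.scale(scale)
        page.x = (i % x) * (width + gap)
        page.y = (y - 1 - i // x) * (height + gap)
    return merged.render()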
def writepdf():
    outfn = "pwat." + os.path.basename(pdf)
    trailer = PdfReader(pdf)
    trailer.Info.Creator = "NOT"
    trailer.Info.Author = "NOT"
    trailer.Info.Title = "NOT"
    trailer.Info.Producer = "NOT"
    trailer.Info.CreationDate = "6/6/6"
    trailer.Info.ModDate = "6/6/6"
    writer = PdfWriter()
    writer.trailer = trailer
    writer.write(outfn)
def makeOnePagers(filename='GPO-CONAN-REV-2014.pdf', path='pdf/'):
    infile = PdfReader(filename)
    pages = len(infile.pages)
    print(pages)
    for i in range(pages):
        p = infile.pages[i]
        if p and len(p) > 0:
            outfile = PdfWriter()
            outfile.addpage(p)
            try:
                outfile.write('pdf/pageindex-%s.pdf' % str(i))
            except:
                pass
            print(i)
def get(self, id):
    inpfn = 'teste.pdf'
    # The subset loop below expects a list of [start, end] page ranges.
    ranges = [[int(id)]]
    # assert ranges, "Expected at least one range"
    # ranges = ([int(y) for y in x.split('-')] for x in ranges)
    outfn = '%sfrag' % os.path.basename(inpfn)
    pages = PdfReader(inpfn).pages
    outdata = PdfWriter()
    #
    for onerange in ranges:
        onerange = (onerange + onerange[-1:])[:2]
        for pagenum in range(onerange[0], onerange[1] + 1):
            outdata.addpage(pages[pagenum - 1])
    outdata.write(outfn)
    # pdfout = base64.encodestring(open(outfn, "rb").read())
    # self.write('<iframe src="data:application/pdf;base64,'+pdfout+'" style="position:fixed; top:0px; left:0px; bottom:0px; right:0px; width:100%; height:100%; border:none; margin:0; padding:0; overflow:hidden; z-index:999999;"/>')
def merge(*varargs, merge_file):
    if not merge_file.endswith('.pdf'):
        merge_file = merge_file + ".pdf"
    for x in varargs:
        if not isinstance(x, str):
            raise Exception("Error: all parameters must be strings.")
    writer = PdfWriter()
    files = []
    for x in varargs:
        if x.endswith('.pdf'):
            files.append(x)
        else:
            raise Exception("Error: all parameters must end with .pdf")
    for fname in sorted(files):
        writer.addpages(PdfReader(os.path.join('pdf_file', fname)).pages)
    writer.write(merge_file)
def consolidateAllSheets(self, subDir=None):
    """Not sure if this is necessary, or maybe I can send multiple sheets to the browser."""
    writer = PdfWriter()
    if subDir is not None:
        directory = self.printdirectory + subDir
    else:
        directory = self.printdirectory
    files = [x for x in os.listdir(directory) if x.endswith('.pdf')]
    for fname in sorted(files):
        writer.addpages(PdfReader(os.path.join(directory, fname)).pages)
    writer.write(directory + "output.pdf")
    for x in os.listdir(directory):
        if x == 'output.pdf':
            continue
        else:
            os.remove(directory + x)
def merge(*varargs, filenameOut='merge_file'):
    if len(varargs) <= 1:
        raise Exception('Error: use at least two files.')
    if not isinstance(filenameOut, str):
        raise Exception('Error: filenameOut must be a string.')
    if not filenameOut.endswith('.pdf'):
        filenameOut = filenameOut + ".pdf"
    writer = PdfWriter()
    for fname in varargs:
        if not isinstance(fname, str):
            raise ValueError("Error: all parameters must be strings.")
        if not fname.endswith('.pdf'):
            raise Exception("Error: all parameters must end with .pdf")
        reader = PdfReader(fname)
        writer.addpages(reader.pages)
    writer.write(filenameOut)
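A minimal call for the keyword-only signature above; the filenames are placeholders:

merge('chapter1.pdf', 'chapter2.pdf', filenameOut='book')  # writes book.pdf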
    page.AA = PdfDict()
    # You probably should just wrap each JS action with a try/catch,
    # because Chrome does no error reporting or even logging otherwise;
    # you just get a silent failure.
    page.AA.O = make_js_action("""
        try {
            %s
        } catch (e) {
            app.alert(e.message);
        }
    """ % (script))
    page.Annots = PdfArray(annots)
    return page


if len(sys.argv) > 1:
    js_file = open(sys.argv[1], 'r')
    fields = []
    for line in js_file:
        if not line.startswith('/// '):
            break
        pieces = line.split()
        params = [pieces[1]] + [float(token) for token in pieces[2:]]
        fields.append(make_field(*params))
    js_file.seek(0)
    out = PdfWriter()
    out.addpage(make_page(fields, js_file.read()))
    out.write('result.pdf')
def render(source, *, progress_cb=lambda x: None): # Exports the self as a PDF document to disk # progress_cb will be called with a progress percentage between 0 and # 100. This percentage calculation is split 50% for the rendering # of the lines and 50% merging with the base PDF file. This callback # also provides an opportunity to abort the process. If the callback # raises an error, this function will take steps to abort gracefullly # and pass the error upwards. vector = True # TODO: Different rendering styles source = sources.get_source(source) # If this is using a base PDF, the percentage is calculated # differently. uses_base_pdf = source.exists('{ID}.pdf') # Document metadata should already be loaded (from device) # ... # Generate page information # If a PDF file was uploaded, but never opened, there may not be # a .content file. So, just load a barebones one with a 'pages' # key of zero length, so it doesn't break the rest of the # process. pages = [] if source.exists('{ID}.content'): with source.open('{ID}.content', 'r') as f: pages = json.load(f).get('pages', []) # Render each page as a pdf tmpfh = tempfile.TemporaryFile() pdf_canvas = canvas.Canvas(tmpfh, (PDFWIDTH, PDFHEIGHT)) # TODO: check pageCompression # Don't load all the pages into memory, because large notebooks # about 500 pages could use up to 3 GB of RAM. Create them by # iteration so they get released by garbage collector. changed_pages = [] annotations = [] for i in range(0, len(pages)): page = document.DocumentPage(source, pages[i], i) if source.exists(page.rmpath): changed_pages.append(i) page.render_to_painter(pdf_canvas, vector) annotations.append(page.get_grouped_annotations()) progress_cb((i + 1) / len(pages) * 50) pdf_canvas.save() tmpfh.seek(0) # This new PDF represents just the notebook. If there was a # parent PDF, merge it now. if uses_base_pdf and not changed_pages: # Since there is no stroke data, just return the PDF data progress_cb(100) log.info('exported pdf') return source.open('{ID}.pdf', 'rb') # PDF exists, stroke data exists, so mix them together. if uses_base_pdf: rmpdfr = PdfReader(tmpfh) basepdfr = PdfReader(source.open('{ID}.pdf', 'rb')) else: basepdfr = PdfReader(tmpfh) # Alias, which is used for annotations and layers. rmpdfr = basepdfr # If making a 'layered' PDF (with optional content groups, # OCGs), associate the annoatations with the layer. # This property list is put into the rmpdfr document, which # will not have any existing properties. ocgprop = IndirectPdfDict(OCGs=PdfArray(), D=PdfDict(Order=PdfArray())) for i in range(0, len(basepdfr.pages)): basepage = basepdfr.pages[i] rmpage = rmpdfr.pages[i] # Apply OCGs apply_ocg = False #TODO configurable? bool(int(QSettings().value( #'pane/notebooks/export_pdf_ocg'))) if apply_ocg: ocgorderinner = do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations) else: ocgorderinner = None # Apply annotations to the rmpage. This must come after # applying OCGs, because the annotation may belong to # one of those groups. apply_annotations(rmpage, annotations[i], ocgorderinner) # If this is a normal notebook with highlighting, # just add the annotations and forget about the rest, # which are page geometry transformations. if uses_base_pdf: merge_pages(basepage, rmpage, i in changed_pages) progress_cb(((i + 1) / rmpdfr.numPages * 50) + 50) # Apply the OCG order. The basepdf may have already had OCGs # and so we must not overwrite them. NOTE: there are other # properties that ought to be carried over, but this is the # minimum required. 
    if apply_ocg:
        if '/OCProperties' in basepdfr.Root:
            basepdfr.Root.OCProperties.OCGs += ocgprop.OCGs
            basepdfr.Root.OCProperties.D.Order += ocgprop.D.Order
        else:
            basepdfr.Root.OCProperties = ocgprop

    pdfw = PdfWriter()
    stream = tempfile.SpooledTemporaryFile(SPOOL_MAX)
    pdfw.write(stream, basepdfr)
    stream.seek(0)
    log.info('exported pdf')
    return stream
'''
import sys
import os

# import find_pdfrw
from pdfrw import PdfReader, PdfWriter

inpfn = sys.argv[1]
rotate = sys.argv[2]
outfn = sys.argv[3]
rotate = int(rotate)
assert rotate % 90 == 0
# ranges = [[int(y) for y in x.split('-')] for x in ranges]
trailer = PdfReader(inpfn)
pages = trailer.pages
ranges = [[1, len(pages)]]
for onerange in ranges:
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0] - 1, onerange[1]):
        pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or 0) + rotate) % 360
outdata = PdfWriter()
outdata.trailer = trailer
outdata.write(outfn)
def debug(event, context):
    # Get Source PDF to watermark
    filename = "sample.pdf"
    existing_pdf = PdfReader(open(filename, "rb"))

    # Get Dimensions of document to make corresponding sized watermark
    mbox = existing_pdf.pages[0].MediaBox
    mediabox = tuple(float(x) for x in mbox)

    with io.BytesIO() as packet:
        height = 40
        width = mediabox[2]

        # create a new PDF with Reportlab
        can = canvas.Canvas(packet)
        can.setPageSize((width, height))

        # Get Copyright content
        copyrightContent = getCopyrightContent()

        # Stylesheet additions
        stylesheet = getSampleStyleSheet()
        style_watermark = stylesheet["Normal"]
        style_watermark.alignment = TA_CENTER
        style_watermark.textColor = colors.Color(0, 0, 0, alpha=0.5)
        style_watermark.fontSize = 8
        style_watermark.font = 'Helvetica'

        # Creating Paragraph
        copyright_paragraph = Paragraph(copyrightContent, style_watermark)

        # Creating Table to wrap Paragraph
        data = [[copyright_paragraph]]
        table = Table(data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, -1), colors.Color(255, 255, 255, alpha=0.5)),
        ]))

        # Adding Table to Canvas
        table.wrapOn(can, math.floor(width), 15)
        table.drawOn(can, 0, 0)

        # Saving
        can.save()

        # Move to start of memory pointer
        packet.seek(0)

        # Setting up PDF as a PDFFileReader object
        watermark_input = PdfReader(packet)
        watermark = watermark_input.pages[0]

    # Iterate through pages, updating source file.
    for current_page in range(len(existing_pdf.pages)):
        print(f'page {current_page}')
        merger = PageMerge(existing_pdf.pages[current_page])
        merger.add(watermark).render()

    # write the modified content to disk
    writer_output = PdfWriter()
    outputStream = open(f"processed_{filename}", "wb")
    with outputStream as pdfOutput:
        writer_output.write(pdfOutput, existing_pdf)
    print('Processed PDF - copyright added')
def copyrightParse(sourceKey, bucketName, context): # BOTO3 objects s3 = boto3.resource('s3') s3client = boto3.client('s3') object = s3.Object(bucketName, sourceKey) # Copyright Data metadata = object.metadata if "copyright" in metadata: return 'Copyright already exists - aborting' dateTimeObj = datetime.now() timestampStr = dateTimeObj.strftime("%d-%b-%Y (%H:%M:%S.%f)") metadata['copyright'] = timestampStr # Get prelim data from object with io.BytesIO(object.get()['Body'].read()) as pdf_content_sample: existing_pdf = PdfReader(pdf_content_sample) # Get Dimensions of document to make corresponding sized watermark mbox = existing_pdf.pages[0].MediaBox mediabox = tuple(float(x) for x in mbox) ### ReportLab implementation # Get Source PDF to watermark - Load single page to generate watermark to the right size # Create memory position for Watermark PDF with io.BytesIO() as packet: print('Loading PDF file - Watermark generation') height = 40 width = mediabox[2] # create a new PDF with Reportlab can = canvas.Canvas(packet) can.setPageSize((width, height)) # Get Copyright content copyrightContent = getCopyrightContent() # Stylesheet additions stylesheet = getSampleStyleSheet() style_watermark = stylesheet["Normal"] style_watermark.alignment = TA_CENTER style_watermark.textColor = colors.Color(0, 0, 0, alpha=0.5) style_watermark.fontSize = 8 style_watermark.font = 'Helvetica' # Creating Paragraph copyright_paragraph = Paragraph(copyrightContent, style_watermark) # Creating Table to wrap Paragraph data = [[copyright_paragraph]] table = Table(data) table.setStyle( TableStyle([ ('BACKGROUND', (0, 0), (-1, -1), colors.Color(255, 255, 255, alpha=0.5)), ])) # Adding Table to Canvas # Make sure the width is an integer! print(f'Table width set to {math.floor(width)}') table.wrapOn(can, math.floor(width), 15) table.drawOn(can, 0, 0) # Saving can.save() # Move to start of memory pointer packet.seek(0) watermark_input = PdfReader(packet) watermark = watermark_input.pages[0] # Iterate through pages, updating source file. for current_page in range(len(existing_pdf.pages)): merger = PageMerge(existing_pdf.pages[current_page]) merger.add(watermark).render() # write the modified content to disk writer_output = PdfWriter() outputStream = io.BytesIO() with outputStream as pdfOutput: writer_output.write(pdfOutput, existing_pdf) print('File written to PDFWriter') pdfOutput.seek(0) s3client.upload_fileobj(pdfOutput, bucketName, sourceKey, ExtraArgs={"Metadata": metadata}) status = f'Copyright Set: {timestampStr}' return status
letters = a, b, c""")
parser.add_argument("--prefix", "-p", default="",
                    help="prefix to the page labels")
parser.add_argument("--firstpagenum", "-f", type=int, default=1,
                    help="number to attribute to the first page of this index")
parser.add_argument("--outfile", "-o", type=Path, default=None, metavar="out.pdf",
                    help="Where to write the output file")
options = parser.parse_args()

reader = PdfReader(str(options.file.resolve()))
if options.delete:
    labels = PageLabels()
else:
    labels = PageLabels.from_pdf(reader)
newlabel = PageLabelScheme(startpage=options.startpage - 1,
                           style=options.type,
                           prefix=options.prefix,
                           firstpagenum=options.firstpagenum)
labels.append(newlabel)

# Write the new page labels to the PDF
labels.write(reader)
print("New labels to be written:")
print("\n".join(map(str, labels)))

writer = PdfWriter()
writer.trailer = reader
outfile = options.outfile or options.file
writer.write(str(outfile.resolve()))
print("Resulting pdf file created : {}".format(outfile))
    if not isinstance(initial, list):
        initial = [initial]
    files = []
    queue = initial[:]
    while bool(queue):
        current = queue.pop(0)
        if isfile(current) and splitext(current)[1] in ext:
            files.append(current)
        elif isdir(current):
            sub = [join(current, x) for x in listdir(current)]
            queue += sub
    logging.info("Found {} {} files".format(len(files), ext))
    return files


pdfs = get_data_files(args.directory, '.pdf')
logging.info("Chopping pdfs")
for pdf in pdfs:
    logging.info("Reading: {}".format(pdf))
    data = PdfReader(pdf)
    edited = PdfWriter()
    for x in range(1, len(data.pages)):
        edited.addpage(data.pages[x])
    out_name = join(args.out, split(pdf)[1])
    logging.info("Writing to: {}".format(out_name))
    edited.write(out_name)
    logging.info("-----------")
from pdfrw import PdfReader, PdfWriter
import os

directory = os.getcwd()
print("Abdullah Faruk ÇİFTLER | farukciftler.com | linkedin.com/in/farukciftler/ \n")
print("Use it for good work :) \n ")
fname = input("Please drop your PDF file into this program's folder, then enter its name as XYZ.pdf: ")
path = directory + '\\' + fname
pdf = PdfReader(path)
pages = len(PdfReader(path).pages)
pagepdf = PdfReader(path).pages
startingpage = 1
while pages >= 1:
    print("Remaining page count: " + str(pages) + "\n ")
    splitpage = int(input("Please enter how many pages to split off from the start: "))
    parts = [(startingpage, startingpage + splitpage)]
    for part in parts:
        outdata = PdfWriter(f'{fname}_sayfa_{part[0]}_{part[1]-1}.pdf')
        for pagenum in range(*part):
            outdata.addpage(pagepdf[pagenum - 1])
        outdata.write()
    startingpage = startingpage + splitpage
    pages = pages - splitpage
def mergepdfs(titles, name):
    outfn = name + '.pdf'
    writer = PdfWriter()
    for inpfn in titles:
        writer.addpages(PdfReader(inpfn).pages)
    writer.write(outfn)
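A minimal usage sketch; the filenames are placeholders:

mergepdfs(['cover.pdf', 'body.pdf', 'appendix.pdf'], 'combined')  # writes combined.pdf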
def makePdfs(input_CSV, input_pdf_template): #Loads CSV template and creates pdfs #Import CSV file with form information and create a python list with that information with open(input_CSV) as CSV_Template: lines = CSV_Template.readlines() cell = [] row = [] table = [] for line in lines: for characters in line: if characters == ',': cell = ''.join(cell) if cell != '': row.append(cell) cell = [] elif characters == '\n': table.append(row) row = [] cell = [] else: cell.append(characters) file_ammount = len(table[3]) - 1 #make list element equal sized based on the number of files requested to be created. Fill in empty cells with '' for rows in table: while len(rows) < file_ammount + 1: rows.append('') for cells in rows: if cells is None: cells = '' #Repeats file path, folder name, project name in table #also checks for blank entries in file names #Changes Current directory text to file path for i, rows in enumerate(table): for j, cells in enumerate(rows[1:]): if rows[j] == 'Current Directory': rows[j] = os.getcwd() try: if rows[j + 1] == '': if rows[j + 1] != rows[j] and i < 3: rows[j + 1] = rows[j] elif i < 4: print( 'error, there is a blank where there shouldnt be') else: break except: continue #put pdf names into list PDF_Names = [] for i in range(len(table[1][1:])): PDF_Names.append(table[1][i + 1] + '_' + table[0][i + 1]) working_directory = os.getcwd() + '/' folder = ['', ''] #load up pdf template template_pdf = pdfrw.PdfReader(input_pdf_template) annotations = template_pdf.pages[0][ANNOT_KEY] inputName = [] #Lets make some PDFs for i in range(file_ammount): #make directory for PDFs if needed folder[0] = table[1][i + 1] if folder[0] != folder[1] and not folder[0] in os.listdir( table[2][i + 1]): os.mkdir(working_directory + folder[0]) folder[1] = folder[0] #create PDF file paths destination_folder = working_directory + folder[0] + '/' file_name = table[3][i + 1] name_ending_each_file = table[0][i + 1] PDF_file_path = destination_folder + file_name + name_ending_each_file + '.pdf' inputName = inputName + [PDF_file_path] #create dictionary of form keys and items information data_table = [] for rows in table[4:]: data_table.append([rows[0], rows[i + 1]]) data_dict = dict(data_table) #Edit PDF template and make PDF for annotation in annotations: if annotation[SUBTYPE_KEY] == WIDGET_SUBTYPE_KEY: if annotation[ANNOT_FIELD_KEY]: key = annotation[ANNOT_FIELD_KEY][1:-1] if key in data_dict.keys(): annotation.update( pdfrw.PdfDict(V='{}'.format(data_dict[key]))) annotation.update( pdfrw.PdfDict(AP='{}'.format({'/N': (144, 0)}))) annotation.update( pdfrw.PdfDict(DA='{}'.format('/Helv 0 Tf 0 g'))) pdfrw.PdfWriter().write(PDF_file_path, template_pdf) assert inputName outfn = destination_folder + '/' + 'combined.pdf' writer = PdfWriter() for inpfn in inputName: writer.addpages(PdfReader(inpfn).pages) writer.write(outfn)
# -*- coding: utf-8 -*-
import os, sys, datetime
from pdfrw import PdfReader, PdfWriter

writer = PdfWriter()
now = datetime.datetime.now()
data_path = os.getcwd() + "/data/"
dir_path = data_path + str(now.year) + '_' + sys.argv[1] + "week"
if not os.path.exists(dir_path + "/result"):
    os.mkdir(dir_path + "/result")
files = [x for x in os.listdir(dir_path) if x.endswith('.pdf')]
for fname in sorted(files, key=lambda x: int(x.split(".")[0])):
    print("[" + fname + "] Merged")
    writer.addpages(PdfReader(os.path.join(dir_path, fname)).pages)
writer.write(dir_path + "/result/" + str(now.year) + "_" + sys.argv[1] + "_merge.pdf")
print("\nENDED MERGE REPORT!")
from pdfrw import PdfReader, PdfWriter

mall = PdfReader('Mall.pdf')
text = PdfReader('kandidat.pdf')
writer = PdfWriter()
writer.addpage(mall.pages[0])
writer.addpage(mall.pages[1])
for page in text.pages:
    writer.addpage(page)
writer.addpage(mall.pages[2])
writer.write('KarlJohannesKandidat.pdf')
def write_async(self, outfile, process_semaphore, progress_cb=None): pdf_writer = PdfWriter(version="1.5") pdf_group = PdfDict() pdf_group.indirect = True pdf_group.CS = PdfName.DeviceRGB pdf_group.I = PdfBool(True) pdf_group.S = PdfName.Transparency pdf_font_mapping = PdfDict() pdf_font_mapping.indirect = True pdf_font_mapping.F1 = self._build_font() for _ in self._pages: pdf_page = PdfDict() pdf_page.Type = PdfName.Page pdf_writer.addpage(pdf_page) # pdfrw makes a internal copy of the pages # use the copy so that references to pages in links are correct pdf_pages = list(pdf_writer.pagearray) srgb_colorspace = PdfDict() srgb_colorspace.indirect = True srgb_colorspace.N = 3 # Number of components (red, green, blue) with open(SRGB_ICC_FILENAME, "rb") as f: srgb_colorspace_stream = f.read() srgb_colorspace.Filter = [PdfName.FlateDecode] srgb_colorspace.stream = zlib.compress(srgb_colorspace_stream, 9).decode("latin-1") srgb_colorspace.Length1 = len(srgb_colorspace_stream) default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace]) default_rgb_colorspace.indirect = True # Handle all pages in parallel @asyncio.coroutine def make_page(page, pdf_page, psem): # Prepare everything in parallel @asyncio.coroutine def get_pdf_thumbnail(psem): if page.thumbnail is None: return None return (yield from page.thumbnail.pdf_thumbnail(psem)) @asyncio.coroutine def get_pdf_background(psem): if page.background is None: return None return (yield from page.background.pdf_image(psem)) @asyncio.coroutine def get_pdf_mask(foreground, psem): if foreground.color is not None: return None return (yield from foreground.pdf_mask(psem)) pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = ( yield from asyncio.gather( get_pdf_thumbnail(psem), get_pdf_background(psem), asyncio.gather( *[fg.pdf_image(psem) for fg in page.foreground]), asyncio.gather( *[get_pdf_mask(fg, psem) for fg in page.foreground]))) pdf_page.MediaBox = PdfArray( [0, 0, PdfNumber(page.width), PdfNumber(page.height)]) pdf_page.Group = pdf_group pdf_resources = PdfDict() pdf_colorspace = PdfDict() pdf_colorspace.DefaultRGB = default_rgb_colorspace pdf_resources.ColorSpace = pdf_colorspace pdf_xobject = PdfDict() if pdf_thumbnail is not None: pdf_page.Thumb = pdf_thumbnail im_index = 0 # Save graphics state and scale unity rectangle to page size matrix = TransformationMatrix() matrix.scale(page.width, page.height) before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf()) after_graphics = "\nQ\n" contents = "" graphics = "" current_color = None if page.color != self._factory.WHITE: if current_color != page.color: current_color = page.color graphics += page.color.to_pdf() + " rg " graphics += ("0 0 1 1 re " + "f\n") if pdf_background is not None: pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background graphics += "/Im%d Do\n" % im_index im_index += 1 for foreground, pdf_foreground, pdf_mask in zip( page.foreground, pdf_foregrounds, pdf_masks): if pdf_mask is not None: pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask im_index += 1 pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground if (foreground.color is not None and current_color != foreground.color): current_color = foreground.color graphics += foreground.color.to_pdf() + " rg " graphics += "/Im%d Do\n" % im_index im_index += 1 if graphics: contents += (before_graphics + graphics.rstrip(" \n") + after_graphics) current_color = None before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n") after_text = "\nET\n" text = "" pdf_annots = [] for t in page.text: if t.text: matrix = TransformationMatrix() 
# Glyph size is 0.5 x 1 matrix.scale(2 / len(t.text), 1) matrix.translate(-0.5, -0.5) if t.direction == "ltr": pass elif t.direction == "rtl": matrix.translate(0, -1) elif t.direction == "ttb": matrix.rotate(90) matrix.rotate(-t.rotation) matrix.translate(0.5, 0.5) matrix.scale(t.width, t.height) matrix.translate(t.x, t.y) text += "%s Tm %s Tj\n" % ( matrix.to_pdf(), PdfString().from_bytes( t.text.encode("utf-16-be"), bytes_encoding="hex")) if t.external_link is not None or t.internal_link is not None: pdf_annot = PdfDict() pdf_annots.append(pdf_annot) pdf_annot.Type = PdfName.Annot pdf_annot.Subtype = PdfName.Link pdf_annot.Border = [0, 0, 0] pdf_annot.Rect = [ PdfNumber(t.x), PdfNumber(t.y), PdfNumber(t.x + t.width), PdfNumber(t.y + t.height) ] if t.external_link is not None: pdf_a = PdfDict() pdf_annot.A = pdf_a pdf_a.Type = PdfName.Action pdf_a.S = PdfName.URI pdf_a.URI = t.external_link.decode("latin-1") if t.internal_link is not None: pdf_target_page = pdf_pages[t.internal_link[0]] target_x, target_y = t.internal_link[1] pdf_annot.Dest = [ pdf_target_page, PdfName.XYZ, PdfNumber(target_x), PdfNumber(target_y), 0 ] text = text.rstrip(" \n") if text: pdf_resources.Font = pdf_font_mapping contents += (before_text + text + after_text) contents = contents.rstrip(" \n") if contents: pdf_contents = PdfDict() pdf_contents.indirect = True pdf_page.Contents = pdf_contents if COMPRESS_PAGE_CONTENTS: pdf_contents.Filter = [PdfName.FlateDecode] pdf_contents.stream = zlib.compress( contents.encode("latin-1"), 9).decode("latin-1") else: pdf_contents.stream = contents if pdf_annots: pdf_page.Annots = pdf_annots if pdf_xobject: pdf_resources.XObject = pdf_xobject if pdf_resources: pdf_page.Resources = pdf_resources # Report progress nonlocal finished_pages finished_pages += 1 if progress_cb: progress_cb(finished_pages / len(self._pages)) finished_pages = 0 yield from asyncio.gather(*[ make_page(page, pdf_page, process_semaphore) for page, pdf_page in zip(self._pages, pdf_pages) ]) trailer = pdf_writer.trailer document_id = PdfString().from_bytes(os.urandom(16)) trailer.ID = [document_id, document_id] mark_info = PdfDict() mark_info.Marked = PdfBool(True) trailer.Root.MarkInfo = mark_info struct_tree_root = PdfDict() struct_tree_root.Type = PdfName.StructTreeRoot trailer.Root.StructTreeRoot = struct_tree_root metadata = PdfDict() metadata.indirect = True metadata.Type = PdfName.Metadata metadata.Subtype = PdfName.XML xmp = XMPMeta() xmp.set_property(XMP_NS_PDFA_ID, "part", "2") xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A") metadata_stream = xmp.serialize_to_str().encode("utf-8") metadata.Filter = [PdfName.FlateDecode] metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1") metadata.Length1 = len(metadata_stream) trailer.Root.Metadata = metadata with TemporaryDirectory(prefix="djpdf-") as temp_dir: pdf_writer.write(path.join(temp_dir, "temp.pdf")) cmd = [ QPDF_CMD, "--stream-data=preserve", "--object-streams=preserve", "--normalize-content=n", "--newline-before-endstream" ] if LINEARIZE_PDF: cmd.extend(["--linearize"]) cmd.extend([ path.abspath(path.join(temp_dir, "temp.pdf")), path.abspath(outfile) ]) yield from run_command_async(cmd, process_semaphore)
args = parser.parse_args()

# The shuffling magic
even = PdfReader(args.evenFile[0])
odd = PdfReader(args.oddFile[0])
isEvenReversed = args.evenrev
isOddReversed = args.oddrev
all = PdfWriter()
blank = PageMerge()
blank.mbox = [0, 0, 612, 792]  # 8.5 x 11
blank = blank.render()
if isEvenReversed and not isOddReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[i])
        all.addpage(even.pages[len(even.pages) - 1 - i])
elif isOddReversed and not isEvenReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[len(odd.pages) - 1 - i])
        all.addpage(even.pages[i])
elif isEvenReversed and isOddReversed:
    for i in range(0, len(odd.pages)):
        all.addpage(odd.pages[len(odd.pages) - 1 - i])
        all.addpage(even.pages[len(even.pages) - 1 - i])
else:
    for x, y in zip(odd.pages, even.pages):
        all.addpage(x)
        all.addpage(y)
all.write(args.resultFile[0])
def render(source, *, progress_cb=lambda x: None, expand_pages=True, template_alpha=0.3, only_annotated=False, black='black', white='white', gray=None, highlight=HIGHLIGHT_DEFAULT_COLOR): """Render a source document as a PDF file. source: The reMarkable document to be rendered. This may be - A filename or pathlib.Path to a zip file containing the document, such as is provided by the Cloud API. - A filename or pathlib.Path to a root-level file from the document, such as might be copied off the device directly. - An object implementing the Source API. See rmrl.sources for examples and further documentation. progress_cb: A function which will be called with a progress percentage between 0 and 100. The first 50% indicate rendering the annotations, and the second the merging of these into the base PDF file. If this callback raises an error, this function will abort gracefully and propagate the error up the stack. expand_pages: Boolean value (default True) indicating whether pages should be made larger, to reflect the view provided by the reMarkable device. template_alpha: Opacity of the template backgrounds in notebooks. 0 makes the templates invisible, 1 makes them fully dark. only_annotated: Boolean value (default False) indicating whether only pages with annotations should be output. black: A string giving the color to use as "black" in the document. Can be a color name or a hex string. Default: 'black' white: A string giving the color to use as "white" in the document. See `black` parameter for format. Default: 'white' gray: A string giving the color to use as "gray" in the document. See `black` parameter for format. Default: None, which means to pick an average between the "white" and "black" values. highlight: A string giving the color to use for the highlighter. See `black` parameter for format. """ colors = parse_colors(black, white, gray, highlight) vector = True # TODO: Different rendering styles source = sources.get_source(source) # If this is using a base PDF, the percentage is calculated # differently. uses_base_pdf = source.exists('{ID}.pdf') # Generate page information # If a PDF file was uploaded, but never opened, there may not be # a .content file. So, just load a barebones one with a 'pages' # key of zero length, so it doesn't break the rest of the # process. pages = [] if source.exists('{ID}.content'): with source.open('{ID}.content', 'r') as f: pages = json.load(f).get('pages', []) # Render each page as a pdf tmpfh = tempfile.TemporaryFile() pdf_canvas = canvas.Canvas(tmpfh, (PDFWIDTH, PDFHEIGHT)) # TODO: check pageCompression # Don't load all the pages into memory, because large notebooks # about 500 pages could use up to 3 GB of RAM. Create them by # iteration so they get released by garbage collector. changed_pages = [] annotations = [] for i in range(0, len(pages)): page = document.DocumentPage(source, pages[i], i, colors=colors) if source.exists(page.rmpath): changed_pages.append(i) page.render_to_painter(pdf_canvas, vector, template_alpha) annotations.append(page.get_grouped_annotations()) progress_cb((i + 1) / len(pages) * 50) pdf_canvas.save() tmpfh.seek(0) # This new PDF represents just the notebook. If there was a # parent PDF, merge it now. if uses_base_pdf and not changed_pages: # Since there is no stroke data, just return the PDF data progress_cb(100) log.info('exported pdf') return source.open('{ID}.pdf', 'rb') # PDF exists, stroke data exists, so mix them together. 
if uses_base_pdf: rmpdfr = PdfReader(tmpfh) basepdfr = PdfReader(source.open('{ID}.pdf', 'rb')) else: basepdfr = PdfReader(tmpfh) # Alias, which is used for annotations and layers. rmpdfr = basepdfr # If making a 'layered' PDF (with optional content groups, # OCGs), associate the annoatations with the layer. # This property list is put into the rmpdfr document, which # will not have any existing properties. ocgprop = IndirectPdfDict(OCGs=PdfArray(), D=PdfDict(Order=PdfArray())) for i in range(0, len(basepdfr.pages)): basepage = basepdfr.pages[i] rmpage = rmpdfr.pages[i] # Apply OCGs apply_ocg = False #TODO configurable? bool(int(QSettings().value( #'pane/notebooks/export_pdf_ocg'))) if apply_ocg: ocgorderinner = do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations) else: ocgorderinner = None # Apply annotations to the rmpage. This must come after # applying OCGs, because the annotation may belong to # one of those groups. apply_annotations(rmpage, annotations[i], ocgorderinner) # If this is a normal notebook with highlighting, # just add the annotations and forget about the rest, # which are page geometry transformations. if uses_base_pdf: merge_pages(basepage, rmpage, i in changed_pages, expand_pages) progress_cb(((i + 1) / rmpdfr.numPages * 50) + 50) # Apply the OCG order. The basepdf may have already had OCGs # and so we must not overwrite them. NOTE: there are other # properties that ought to be carried over, but this is the # minimum required. if apply_ocg: if '/OCProperties' in basepdfr.Root: basepdfr.Root.OCProperties.OCGs += ocgprop.OCGs basepdfr.Root.OCProperties.D.Order += ocgprop.D.Order else: basepdfr.Root.OCProperties = ocgprop stream = tempfile.SpooledTemporaryFile(SPOOL_MAX) pdfw = PdfWriter(stream) if not only_annotated: # We are writing out everything, so we can take this shortcut: pdfw.write(trailer=basepdfr) else: for i, page in enumerate(basepdfr.pages): if i in changed_pages: pdfw.addpage(page) pdfw.write() stream.seek(0) log.info('exported pdf') return stream
So she did an 8.5x11" output with 0.5" margin all around (actual size of
useful area 7.5x10") and we scaled it up by 4.8. We also copy the Info
dict to the new PDF.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    page = PageMerge().add(page, viewrect=viewrect)
    page[0].scale(scale)
    return page.render()


inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter(outfn)
writer.addpage(adjust(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()
def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw): with open(f, "rb") as inf: orig_imgdata = inf.read() output = img2pdf.convert(orig_imgdata, nodate=True, with_pdfrw=with_pdfrw) from pdfrw import PdfReader, PdfName, PdfWriter from pdfrw.py23_diffs import convert_load, convert_store x = PdfReader(PdfReaderIO(convert_load(output))) self.assertEqual(sorted(x.keys()), [PdfName.Info, PdfName.Root, PdfName.Size]) self.assertIn(x.Root.Pages.Count, ('1', '2')) if len(x.Root.Pages.Kids) == '1': self.assertEqual(x.Size, '7') self.assertEqual(len(x.Root.Pages.Kids), 1) elif len(x.Root.Pages.Kids) == '2': self.assertEqual(x.Size, '10') self.assertEqual(len(x.Root.Pages.Kids), 2) self.assertEqual(x.Info, {}) self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages, PdfName.Type]) self.assertEqual(x.Root.Type, PdfName.Catalog) self.assertEqual(sorted(x.Root.Pages.keys()), [PdfName.Count, PdfName.Kids, PdfName.Type]) self.assertEqual(x.Root.Pages.Type, PdfName.Pages) orig_img = Image.open(f) for pagenum in range(len(x.Root.Pages.Kids)): # retrieve the original image frame that this page was # generated from orig_img.seek(pagenum) cur_page = x.Root.Pages.Kids[pagenum] ndpi = orig_img.info.get("dpi", (96.0, 96.0)) # In python3, the returned dpi value for some tiff images will # not be an integer but a float. To make the behaviour of # img2pdf the same between python2 and python3, we convert that # float into an integer by rounding. # Search online for the 72.009 dpi problem for more info. ndpi = (int(round(ndpi[0])), int(round(ndpi[1]))) imgwidthpx, imgheightpx = orig_img.size pagewidth = 72.0*imgwidthpx/ndpi[0] pageheight = 72.0*imgheightpx/ndpi[1] def format_float(f): if int(f) == f: return str(int(f)) else: return ("%.4f" % f).rstrip("0") self.assertEqual(sorted(cur_page.keys()), [PdfName.Contents, PdfName.MediaBox, PdfName.Parent, PdfName.Resources, PdfName.Type]) self.assertEqual(cur_page.MediaBox, ['0', '0', format_float(pagewidth), format_float(pageheight)]) self.assertEqual(cur_page.Parent, x.Root.Pages) self.assertEqual(cur_page.Type, PdfName.Page) self.assertEqual(cur_page.Resources.keys(), [PdfName.XObject]) self.assertEqual(cur_page.Resources.XObject.keys(), [PdfName.Im0]) self.assertEqual(cur_page.Contents.keys(), [PdfName.Length]) self.assertEqual(cur_page.Contents.Length, str(len(cur_page.Contents.stream))) self.assertEqual(cur_page.Contents.stream, "q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n" "/Im0 Do\nQ" % (pagewidth, pageheight)) imgprops = cur_page.Resources.XObject.Im0 # test if the filter is valid: self.assertIn( imgprops.Filter, [[PdfName.DCTDecode], [PdfName.JPXDecode], [PdfName.FlateDecode], [PdfName.CCITTFaxDecode]]) # test if the colorspace is valid self.assertIn( imgprops.ColorSpace, [PdfName.DeviceGray, PdfName.DeviceRGB, PdfName.DeviceCMYK]) # test if the image has correct size self.assertEqual(imgprops.Width, str(orig_img.size[0])) self.assertEqual(imgprops.Height, str(orig_img.size[1])) # if the input file is a jpeg then it should've been copied # verbatim into the PDF if imgprops.Filter in [[PdfName.DCTDecode], [PdfName.JPXDecode]]: self.assertEqual( cur_page.Resources.XObject.Im0.stream, convert_load(orig_imgdata)) elif imgprops.Filter == [PdfName.CCITTFaxDecode]: tiff_header = tiff_header_for_ccitt( int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4) imgio = BytesIO() imgio.write(tiff_header) imgio.write(convert_store( cur_page.Resources.XObject.Im0.stream)) imgio.seek(0) im = Image.open(imgio) self.assertEqual(im.tobytes(), orig_img.tobytes()) try: 
im.close() except AttributeError: pass elif imgprops.Filter == [PdfName.FlateDecode]: # otherwise, the data is flate encoded and has to be equal # to the pixel data of the input image imgdata = zlib.decompress( convert_store(cur_page.Resources.XObject.Im0.stream)) colorspace = imgprops.ColorSpace if colorspace == PdfName.DeviceGray: colorspace = 'L' elif colorspace == PdfName.DeviceRGB: colorspace = 'RGB' elif colorspace == PdfName.DeviceCMYK: colorspace = 'CMYK' else: raise Exception("invalid colorspace") im = Image.frombytes(colorspace, (int(imgprops.Width), int(imgprops.Height)), imgdata) if orig_img.mode == '1': self.assertEqual(im.tobytes(), orig_img.convert("L").tobytes()) elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"): self.assertEqual(im.tobytes(), orig_img.convert("RGB").tobytes()) # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not # have the close() method try: im.close() except AttributeError: pass # now use pdfrw to parse and then write out both pdfs and check the # result for equality y = PdfReader(out) outx = BytesIO() outy = BytesIO() xwriter = PdfWriter() ywriter = PdfWriter() xwriter.trailer = x ywriter.trailer = y xwriter.write(outx) ywriter.write(outy) self.assertEqual(outx.getvalue(), outy.getvalue()) # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # close() method try: orig_img.close() except AttributeError: pass
def go(inpfn, outfn):
    pages = PdfReader(inpfn).pages
    writer = PdfWriter()
    while pages:
        writer.addpage(get4(pages))
    writer.write(outfn)
def write_async(self, outfile, process_semaphore, progress_cb=None): pdf_writer = PdfWriter(version="1.5") pdf_group = PdfDict() pdf_group.indirect = True pdf_group.CS = PdfName.DeviceRGB pdf_group.I = PdfBool(True) pdf_group.S = PdfName.Transparency pdf_font_mapping = PdfDict() pdf_font_mapping.indirect = True pdf_font_mapping.F1 = self._build_font() for _ in self._pages: pdf_page = PdfDict() pdf_page.Type = PdfName.Page pdf_writer.addpage(pdf_page) # pdfrw makes a internal copy of the pages # use the copy so that references to pages in links are correct pdf_pages = list(pdf_writer.pagearray) # Handle all pages in parallel @asyncio.coroutine def make_page(page, pdf_page, psem): # Prepare everything in parallel @asyncio.coroutine def get_pdf_thumbnail(psem): if page.thumbnail is None: return None return (yield from page.thumbnail.pdf_thumbnail(psem)) @asyncio.coroutine def get_pdf_background(psem): if page.background is None: return None return (yield from page.background.pdf_image(psem)) @asyncio.coroutine def get_pdf_mask(foreground, psem): if foreground.color is not None: return None return (yield from foreground.pdf_mask(psem)) pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = ( yield from asyncio.gather( get_pdf_thumbnail(psem), get_pdf_background(psem), asyncio.gather(*[fg.pdf_image(psem) for fg in page.foreground]), asyncio.gather(*[get_pdf_mask(fg, psem) for fg in page.foreground]))) pdf_page.MediaBox = PdfArray([0, 0, PdfNumber(page.width), PdfNumber(page.height)]) pdf_page.Group = pdf_group pdf_resources = PdfDict() pdf_xobject = PdfDict() if pdf_thumbnail is not None: pdf_page.Thumb = pdf_thumbnail im_index = 0 # Save graphics state and scale unity rectangle to page size matrix = TransformationMatrix() matrix.scale(page.width, page.height) before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf()) after_graphics = "\nQ\n" contents = "" graphics = "" current_color = None if page.color != self._factory.WHITE: if current_color != page.color: current_color = page.color graphics += page.color.to_pdf() + " rg " graphics += ("0 0 1 1 re " + "f\n") if pdf_background is not None: pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background graphics += "/Im%d Do\n" % im_index im_index += 1 for foreground, pdf_foreground, pdf_mask in zip( page.foreground, pdf_foregrounds, pdf_masks): if pdf_mask is not None: pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask im_index += 1 pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground if (foreground.color is not None and current_color != foreground.color): current_color = foreground.color graphics += foreground.color.to_pdf() + " rg " graphics += "/Im%d Do\n" % im_index im_index += 1 if graphics: contents += (before_graphics + graphics.rstrip(" \n") + after_graphics) current_color = None before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n") after_text = "\nET\n" text = "" pdf_annots = [] for t in page.text: if t.text: matrix = TransformationMatrix() # Glyph size is 0.5 x 1 matrix.scale(2 / len(t.text), 1) matrix.translate(-0.5, -0.5) if t.direction == "ltr": pass elif t.direction == "rtl": matrix.translate(0, -1) elif t.direction == "ttb": matrix.rotate(90) matrix.rotate(-t.rotation) matrix.translate(0.5, 0.5) matrix.scale(t.width, t.height) matrix.translate(t.x, t.y) text += "%s Tm %s Tj\n" % ( matrix.to_pdf(), PdfString().from_bytes( t.text.encode("utf-16-be"), bytes_encoding="hex")) if t.external_link is not None or t.internal_link is not None: pdf_annot = PdfDict() pdf_annots.append(pdf_annot) pdf_annot.Type = PdfName.Annot pdf_annot.Subtype = 
PdfName.Link pdf_annot.Border = [0, 0, 0] pdf_annot.Rect = [PdfNumber(t.x), PdfNumber(t.y), PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)] if t.external_link is not None: pdf_a = PdfDict() pdf_annot.A = pdf_a pdf_a.Type = PdfName.Action pdf_a.S = PdfName.URI pdf_a.URI = t.external_link.decode("latin-1") if t.internal_link is not None: pdf_target_page = pdf_pages[t.internal_link[0]] target_x, target_y = t.internal_link[1] pdf_annot.Dest = [ pdf_target_page, PdfName.XYZ, PdfNumber(target_x), PdfNumber(target_y), 0] text = text.rstrip(" \n") if text: pdf_resources.Font = pdf_font_mapping contents += (before_text + text + after_text) contents = contents.rstrip(" \n") if contents: pdf_contents = PdfDict() pdf_contents.indirect = True pdf_page.Contents = pdf_contents if COMPRESS_PAGE_CONTENTS: pdf_contents.Filter = [PdfName.FlateDecode] pdf_contents.stream = zlib.compress( contents.encode("latin-1"), 9).decode("latin-1") else: pdf_contents.stream = contents if pdf_annots: pdf_page.Annots = pdf_annots if pdf_xobject: pdf_resources.XObject = pdf_xobject if pdf_resources: pdf_page.Resources = pdf_resources # Report progress nonlocal finished_pages finished_pages += 1 if progress_cb: progress_cb(finished_pages / len(self._pages)) finished_pages = 0 yield from asyncio.gather( *[make_page(page, pdf_page, process_semaphore) for page, pdf_page in zip(self._pages, pdf_pages)]) with TemporaryDirectory(prefix="djpdf-") as temp_dir: pdf_writer.write(path.join(temp_dir, "temp.pdf")) cmd = [QPDF_CMD, "--stream-data=preserve", "--object-streams=preserve", "--normalize-content=n"] if LINEARIZE_PDF: cmd.extend(["--linearize"]) cmd.extend([path.abspath(path.join(temp_dir, "temp.pdf")), path.abspath(outfile)]) yield from run_command_async(cmd, process_semaphore)
summary = PdfWriter()
summary_path = OUTPUT_PATH_PREFIX.format(now=now, here=here) + ".pdf"
failed_to_add_page_errors = []
for path, article, matching_authors in paths:
    try:
        with open(path, "rb") as rp:
            summary.addpage(PdfReader(rp).pages[0])
    except PdfParseError:
        print(f"Failed to add page from {path}")
        failed_to_add_page_errors.append((path, article, matching_authors))

with open(summary_path, "wb") as fp:
    summary.write(fp)

if failed_to_add_page_errors or new_articles_with_errors:
    failure_summary = "However, there were some errors that occurred.\n\n"
    for count, (article, matching_authors) in enumerate(new_articles_with_errors, start=1):
        kwds = formatted_summary(article)
        kwds.update(count=count)
        summary = EXECUTIVE_SUMMARY_ARTICLE_FORMAT.format(**kwds).rstrip()
        failure = f"""Could not find PDF for this article: {summary}
except ImportError:
    print("Please install the pdfrw library on your system!\n\n")
    print("sudo apt install python3-pdfrw\n")
    quit()


# Strip the trailing \n from each line of the list
def remove_quebra_de_linha(linha):
    return linha.replace('\n', '')


# Responsible for writing the merged PDF
writer = PdfWriter()

# List of PDF files, one per line, each with its full filesystem path
# It should look something like:
# /home/meu_usuario/arquivos_pdf/arquivo1.pdf
# /home/meu_usuario/arquivos_pdf/arquivo2.pdf
pdf_list = open("my_pdfs.txt")

# Full path of the output file. As written below, it is created in the script's folder
pdefao = 'super.pdf'

# Read the PDF list line by line and add each file to the big output file
for arquivo in pdf_list:
    arquivo = remove_quebra_de_linha(arquivo)
    writer.addpages(PdfReader(arquivo).pages)

writer.write(pdefao)
def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw): with open(f, "rb") as inf: orig_imgdata = inf.read() output = img2pdf.convert(orig_imgdata, nodate=True, with_pdfrw=with_pdfrw) from pdfrw import PdfReader, PdfName, PdfWriter from pdfrw.py23_diffs import convert_load, convert_store x = PdfReader(PdfReaderIO(convert_load(output))) self.assertEqual(sorted(x.keys()), [PdfName.Info, PdfName.Root, PdfName.Size]) self.assertIn(x.Root.Pages.Count, ('1', '2')) if len(x.Root.Pages.Kids) == '1': self.assertEqual(x.Size, '7') self.assertEqual(len(x.Root.Pages.Kids), 1) elif len(x.Root.Pages.Kids) == '2': self.assertEqual(x.Size, '10') self.assertEqual(len(x.Root.Pages.Kids), 2) self.assertEqual(x.Info, {}) self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages, PdfName.Type]) self.assertEqual(x.Root.Type, PdfName.Catalog) self.assertEqual(sorted(x.Root.Pages.keys()), [PdfName.Count, PdfName.Kids, PdfName.Type]) self.assertEqual(x.Root.Pages.Type, PdfName.Pages) orig_img = Image.open(f) for pagenum in range(len(x.Root.Pages.Kids)): # retrieve the original image frame that this page was # generated from orig_img.seek(pagenum) cur_page = x.Root.Pages.Kids[pagenum] ndpi = orig_img.info.get("dpi", (96.0, 96.0)) # In python3, the returned dpi value for some tiff images will # not be an integer but a float. To make the behaviour of # img2pdf the same between python2 and python3, we convert that # float into an integer by rounding. # Search online for the 72.009 dpi problem for more info. ndpi = (int(round(ndpi[0])), int(round(ndpi[1]))) imgwidthpx, imgheightpx = orig_img.size pagewidth = 72.0*imgwidthpx/ndpi[0] pageheight = 72.0*imgheightpx/ndpi[1] def format_float(f): if int(f) == f: return str(int(f)) else: return ("%.4f" % f).rstrip("0") self.assertEqual(sorted(cur_page.keys()), [PdfName.Contents, PdfName.MediaBox, PdfName.Parent, PdfName.Resources, PdfName.Type]) self.assertEqual(cur_page.MediaBox, ['0', '0', format_float(pagewidth), format_float(pageheight)]) self.assertEqual(cur_page.Parent, x.Root.Pages) self.assertEqual(cur_page.Type, PdfName.Page) self.assertEqual(cur_page.Resources.keys(), [PdfName.XObject]) self.assertEqual(cur_page.Resources.XObject.keys(), [PdfName.Im0]) self.assertEqual(cur_page.Contents.keys(), [PdfName.Length]) self.assertEqual(cur_page.Contents.Length, str(len(cur_page.Contents.stream))) self.assertEqual(cur_page.Contents.stream, "q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n" "/Im0 Do\nQ" % (pagewidth, pageheight)) imgprops = cur_page.Resources.XObject.Im0 # test if the filter is valid: self.assertIn( imgprops.Filter, [PdfName.DCTDecode, PdfName.JPXDecode, PdfName.FlateDecode, [PdfName.CCITTFaxDecode]]) # test if the image has correct size self.assertEqual(imgprops.Width, str(orig_img.size[0])) self.assertEqual(imgprops.Height, str(orig_img.size[1])) # if the input file is a jpeg then it should've been copied # verbatim into the PDF if imgprops.Filter in [PdfName.DCTDecode, PdfName.JPXDecode]: self.assertEqual( cur_page.Resources.XObject.Im0.stream, convert_load(orig_imgdata)) elif imgprops.Filter == [PdfName.CCITTFaxDecode]: tiff_header = tiff_header_for_ccitt( int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4) imgio = BytesIO() imgio.write(tiff_header) imgio.write(convert_store( cur_page.Resources.XObject.Im0.stream)) imgio.seek(0) im = Image.open(imgio) self.assertEqual(im.tobytes(), orig_img.tobytes()) try: im.close() except AttributeError: pass elif imgprops.Filter == PdfName.FlateDecode: # otherwise, the data is flate encoded and has to be equal 
# to the pixel data of the input image imgdata = zlib.decompress( convert_store(cur_page.Resources.XObject.Im0.stream)) if imgprops.DecodeParms: if orig_img.format == 'PNG': pngidat, palette = img2pdf.parse_png(orig_imgdata) elif orig_img.format == 'TIFF' \ and orig_img.info['compression'] == "group4": offset, length = \ img2pdf.ccitt_payload_location_from_pil( orig_img) pngidat = orig_imgdata[offset:offset+length] else: pngbuffer = BytesIO() orig_img.save(pngbuffer, format="png") pngidat, palette = img2pdf.parse_png( pngbuffer.getvalue()) self.assertEqual(zlib.decompress(pngidat), imgdata) else: colorspace = imgprops.ColorSpace if colorspace == PdfName.DeviceGray: colorspace = 'L' elif colorspace == PdfName.DeviceRGB: colorspace = 'RGB' elif colorspace == PdfName.DeviceCMYK: colorspace = 'CMYK' else: raise Exception("invalid colorspace") im = Image.frombytes(colorspace, (int(imgprops.Width), int(imgprops.Height)), imgdata) if orig_img.mode == '1': self.assertEqual(im.tobytes(), orig_img.convert("L").tobytes()) elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"): self.assertEqual(im.tobytes(), orig_img.convert("RGB").tobytes()) # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does # not have the close() method try: im.close() except AttributeError: pass # now use pdfrw to parse and then write out both pdfs and check the # result for equality y = PdfReader(out) outx = BytesIO() outy = BytesIO() xwriter = PdfWriter() ywriter = PdfWriter() xwriter.trailer = x ywriter.trailer = y xwriter.write(outx) ywriter.write(outy) self.assertEqual(compare_pdf(outx.getvalue(), outy.getvalue()), True) # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # close() method try: orig_img.close() except AttributeError: pass
import sys
import os

import find_pdfrw
from pdfrw import PdfReader, PdfWriter

inpfn = sys.argv[1]
rotate = sys.argv[2]
ranges = sys.argv[3:]
rotate = int(rotate)
assert rotate % 90 == 0
ranges = [[int(y) for y in x.split('-')] for x in ranges]
outfn = 'rotate.%s' % os.path.basename(inpfn)
trailer = PdfReader(inpfn)
pages = trailer.pages
if not ranges:
    ranges = [[1, len(pages)]]
for onerange in ranges:
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0] - 1, onerange[1]):
        pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or 0) + rotate) % 360
outdata = PdfWriter()
outdata.trailer = trailer
outdata.write(outfn)
usage:   4up.py my.pdf

Creates 4up.my.pdf with a single output page for every 4 input pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def get4(srcpages):
    scale = 0.5
    srcpages = PageMerge() + srcpages
    x_increment, y_increment = (scale * i for i in srcpages.xobj_box[2:])
    for i, page in enumerate(srcpages):
        page.scale(scale)
        page.x = x_increment if i & 1 else 0
        page.y = 0 if i & 2 else y_increment
    return srcpages.render()


inpfn, = sys.argv[1:]
outfn = '4up.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
writer = PdfWriter(outfn)
for index in range(0, len(pages), 4):
    writer.addpage(get4(pages[index:index + 4]))
writer.write()
def get(self, format: str, path: str):
    """Handle the GET method call."""
    if format != 'pdf':
        self.log.exception('format must be pdf')
        raise web.HTTPError(500, 'format must be pdf')

    self.config.PDFExporter.preprocessors = [
        thermohw.ExtractAttachmentsPreprocessor]
    self.config.PDFExporter.template_file = os.path.join(
        thermohw_dir, 'homework.tpl')
    self.config.PDFExporter.filters = {
        'convert_div': thermohw.convert_div,
        'convert_raw_html': thermohw.convert_raw_html,
    }
    self.config.PDFExporter.latex_count = 1

    exporter = PDFExporter(config=self.config, log=self.log)
    exporter.writer.build_directory = '.'

    pdfs = []
    path = path.strip('/').strip()
    paths = path.split('.ipynb')
    for path in paths:
        if not path:
            continue
        path += '.ipynb'

        # If the notebook relates to a real file (default contents manager),
        # give its path to nbconvert.
        ext_resources_dir: Union[str, None]
        basename: str
        os_path: str
        if hasattr(self.contents_manager, '_get_os_path'):
            os_path = self.contents_manager._get_os_path(path)
            ext_resources_dir, basename = os.path.split(os_path)
        else:
            ext_resources_dir = None

        model: Dict[str, str] = self.contents_manager.get(path=path)
        name: str = model['name']
        if model['type'] != 'notebook':
            # not a notebook, redirect to files
            return FilesRedirectHandler.redirect_to_files(self, path)

        nb = model['content']
        self.set_header('Last-Modified', model['last_modified'])

        # create resources dictionary
        mod_date: str = model['last_modified'].strftime(text.date_format)
        nb_title: str = os.path.splitext(name)[0]
        config_dir: str = self.application.settings['config_dir']
        resource_dict: Dict[str, str] = {
            "metadata": {"name": nb_title, "modified_date": mod_date},
            "config_dir": config_dir,
        }
        if ext_resources_dir:
            resource_dict['metadata']['path'] = ext_resources_dir

        output: bytes
        try:
            output, _ = exporter.from_notebook_node(
                nb, resources=resource_dict)
        except Exception as e:
            self.log.exception("nbconvert failed: %s", e)
            raise web.HTTPError(500, "nbconvert failed: %s" % e)

        pdfs.append(io.BytesIO(output))

    writer = PdfWriter()
    for pdf in pdfs:
        writer.addpages(PdfReader(pdf).pages)

    bio = io.BytesIO()
    writer.write(bio)
    bio.seek(0)
    output = bio.read()
    bio.close()

    # Force download if requested
    if self.get_argument('download', 'false').lower() == 'true':
        filename = 'final_output.pdf'
        self.set_header('Content-Disposition',
                        'attachment; filename="{}"'.format(filename))

    # MIME type
    if exporter.output_mimetype:
        self.set_header('Content-Type',
                        '{}; charset=utf-8'.format(exporter.output_mimetype))

    self.set_header('Cache-Control',
                    'no-store, no-cache, must-revalidate, max-age=0')
    self.finish(output)
#!/usr/bin/env python
'''
usage:   subset.py my.pdf page[range] [page[range]] ...
         eg. subset.py my.pdf 1-3 5 7-9

Creates subset.my.pdf
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

inpfn = sys.argv[1]
ranges = sys.argv[2:]
assert ranges, "Expected at least one range"
ranges = ([int(y) for y in x.split('-')] for x in ranges)
outfn = 'subset.%s' % os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
outdata = PdfWriter(outfn)
for onerange in ranges:
    # a bare page number "N" is treated as the one-page range "N-N"
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0], onerange[1] + 1):
        outdata.addpage(pages[pagenum - 1])
outdata.write()
from pdfrw import PdfReader, PdfWriter
import os

source_dir = os.getcwd()

writer = PdfWriter()
for item in os.listdir(source_dir):
    if item.endswith('.pdf'):
        writer.addpages(PdfReader(item).pages)

writer.write('result.pdf')
def popups_write_pdf(file):
    from pdfrw import PdfWriter
    w = PdfWriter(version='1.5', compress=pdf_popup_config['compress'])
    w.trailer = popup_pdf
    w.write(file)
letters = a, b, c""") parser.add_argument("--prefix", "-p", default="", help="prefix to the page labels") parser.add_argument("--firstpagenum", "-f", type=int, default=1, help="number to attribute to the first page of this index") options = parser.parse_args() reader = options.file if options.delete: labels = PageLabels() else: labels = PageLabels.from_pdf(reader) newlabel = PageLabelScheme(startpage=options.startpage - 1, style=options.type, prefix=options.prefix, firstpagenum=options.firstpagenum) labels.append(newlabel) # Write the new page labels to the PDF labels.write(reader) print("New labels to be written:") print("\n".join(map(str, labels))) writer = PdfWriter() writer.trailer = reader writer.write("/tmp/test.pdf")
for section in sections:
    print("++++++++++++++++++++++++++++++++++\n+ Adding section: %s\n+" % section)
    big_file = PdfWriter()
    files = os.listdir(section)
    files = sorted(files)
    for f in files:
        fpath = section + '/' + f
        if os.path.isfile(fpath) and fpath.endswith(
                'pptx') and not f == 'Template.pptx':
            print("+ Incorporating: %s" % fpath)
            call([
                "libreoffice", "--headless", "--invisible", "--convert-to",
                "pdf", fpath
            ])
            pdf_file_name = f.replace('pptx', 'pdf')
            pdf_file = PdfReader(pdf_file_name)
            print("+ Adding pages from %s\n+" % pdf_file_name)
            big_file.addpages(pdf_file.pages)
            call(["mv", pdf_file_name, "temp"])
    big_file.write('Part1_%s.pdf' % section)

print("Done.")
import os
import sys

from pdfrw import PdfReader, PdfWriter

if len(sys.argv) != 2:
    print("Usage: InvertOrder.py FILETOINVERT")
    sys.exit()

filename = sys.argv[1]

output = PdfWriter()
for p in reversed(PdfReader(filename).pages):
    output.addpage(p)

fname, fext = os.path.splitext(filename)
outname = fname + "_inv" + fext
print("Writing output to " + outname)
output.write(outname)
import sys
import argparse
import itertools

from pdfrw import PdfWriter, PdfReader

parser = argparse.ArgumentParser(
    description='Interlaces two PDFs to make one complete PDF.')
parser.add_argument('front_pdf_loc', type=str, help="PDF of fronts of pages")
parser.add_argument('back_pdf_loc', type=str, help="PDF of backs of pages")
parser.add_argument('output_loc', type=str, nargs='?', default="output.pdf",
                    help="Output location for interlaced PDF")
args = parser.parse_args()

output = PdfWriter()
front_pdf = PdfReader(args.front_pdf_loc)
back_pdf = PdfReader(args.back_pdf_loc)

if len(front_pdf.pages) != len(back_pdf.pages):
    print("PDFs must have the same number of pages")
    sys.exit(1)

# pair each front page with its back; the backs file is in reverse order,
# so it is flipped before zipping
output.addpages(itertools.chain.from_iterable(
    zip(front_pdf.pages, back_pdf.pages[::-1])))
output.write(args.output_loc)
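A typical invocation might look like the following (the script name interlace.py is only an assumption here; the third argument is optional and defaults to output.pdf):

python interlace.py fronts.pdf backs.pdf combined.pdf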
var BALL_HEIGHT = %(BALL_HEIGHT)s;
var BRICK_ROW_COUNT = %(BRICK_ROW_COUNT)s;
var BRICK_COLUMN_COUNT = %(BRICK_COLUMN_COUNT)s;
var BRICK_WIDTH = %(BRICK_WIDTH)s;
var BRICK_HEIGHT = %(BRICK_HEIGHT)s;
var BRICK_PADDING = %(BRICK_PADDING)s;
var BRICK_OFFSET_BOTTOM = %(BRICK_OFFSET_BOTTOM)s;
var BRICK_OFFSET_LEFT = %(BRICK_OFFSET_LEFT)s;
%(script)s
""" % locals())

page.Contents.stream = """
BT
/F1 24 Tf
150 300 Td (Move your mouse down here!) Tj
40 -100 Td (also, README below...) Tj
ET
"""

readme = PdfReader('README.pdf')
out = PdfWriter()
out.addpage(page)
for readme_page in readme.pages:
    out.addpage(readme_page)
out.write('breakout.pdf')
if args.path:
    path = args.path

if args.verbose:
    print("Searching {} for PDF files.\n".format(path))

# Generate a list of file names (includes the full path)
fileList = []
for filePath in glob(path + "/*.pdf"):
    if args.verbose:
        print("Found {}".format(filePath))
    fileList.append(filePath)

# sort the list in 'natural' order
sortedFiles = natsorted(fileList)

# loop through the list of PDFs, and add them to a new PDF
outFile = PdfWriter()
for pdf in sortedFiles:
    x = PdfReader(pdf)
    if args.verbose:
        print("Adding {} pages from {} to the combined file.".format(
            len(x.pages), pdf))
    outFile.addpages(x.pages)

outFile.write(combinedFile)

if args.verbose:
    m = PdfReader(combinedFile)
    print("\nCombined file created at {} with a total of {} pages.".format(
        combinedFile, len(m.pages)))
# Add a QR code to every page of a multi-page PDF document
from pdfrw import PdfReader, PdfWriter, PageMerge

input_file = "source/Computer-Vision-Resources.pdf"
output_file = "dist/Computer-Vision-Resources-QR-pages.pdf"
watermark_file = "source/waksoft-QR-code.pdf"

# set up the reader and writer objects
reader_input = PdfReader(input_file)
writer_output = PdfWriter()
watermark_input = PdfReader(watermark_file)
watermark = watermark_input.pages[0]

# go through the pages one by one
for current_page in range(len(reader_input.pages)):
    merger = PageMerge(reader_input.pages[current_page])
    merger.add(watermark).render()

# write the modified content to disk
writer_output.write(output_file, reader_input)
So she did an 8.5x11" output with a 0.5" margin all around (so the useful
area is 7.5x10") and we scale it up by 4.8.

We also copy the Info dict to the new PDF.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    page = PageMerge().add(page, viewrect=viewrect)
    page[0].scale(scale)
    return page.render()


inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter()
writer.addpage(adjust(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write(outfn)
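For reference, the margin and scale values above work out as follows. This is only a sanity check of the docstring's arithmetic, using the PDF convention of 72 points per inch:

inch = 72                                    # points per inch in PDF user space
margin = 0.5 * inch                          # 36 pt, the default margin in adjust()
useful_w, useful_h = 8.5 - 1.0, 11.0 - 1.0   # 7.5" x 10" after trimming 0.5" on each side
print(useful_w * 4.8, useful_h * 4.8)        # 36.0 48.0 -> a 36"x48" poster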
from pdfrw import PdfReader

"""
x = PdfReader('source/07922XXX2258-2017Apr13-2017May15.pdf')
print x.keys()
print x.Info
print x.Root.keys()
print len(x.pages)
print x.pages[0]
print x.pages[0].Contents
print x.pages[0].Contents.stream
"""

# writing pdfs
from pdfrw import PdfWriter

writer = PdfWriter()
# writer.addpage(x.pages[0])
# writer.write('out.pdf')

for pdf_filename in pdf_filenames:
    writer.addpages(PdfReader(pdf_filename).pages)

from pdfrw import IndirectPdfDict
writer.trailer.Info = IndirectPdfDict(
    Title='pdf bundle',
    Author='Adobe',
    Subject='pdf',
    Creator='Adobe',
)

writer.write('out.pdf')
for (srcpath, _, filenames) in os.walk('ramdisk/reference'):
    for name in filenames:
        if not name.endswith('.pdf'):
            continue
        src = os.path.join(srcpath, name)
        dst = src.replace('/reference/', '/tmp_results/')
        if not os.path.exists(dst):
            continue
        src_digest = get_digest(src)
        if not src_digest or src_digest not in expected:
            continue
        print src
        count += 1

        trailer = make_canonical(PdfReader(src))
        out = PdfWriter(tmp)
        out.write(trailer=trailer)
        match_digest = get_digest(tmp)
        if not match_digest:
            continue

        trailer = make_canonical(PdfReader(dst))
        out = PdfWriter(tmp)
        out.write(trailer=trailer)
        if get_digest(tmp) != match_digest:
            continue

        goodcount += 1
        print "OK"
        changes.append((src_digest, get_digest(dst)))

print count, goodcount

for stuff in changes:
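The helpers get_digest and make_canonical are defined elsewhere in this script. Purely as an illustration, get_digest presumably hashes a file's contents and returns a falsy value on failure; a minimal hypothetical sketch (the real implementation may well differ) could look like this:

import hashlib

def get_digest(fname):
    # hash the raw file bytes; return None if the file cannot be read
    try:
        with open(fname, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()
    except IOError:
        return None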
#!/usr/bin/env python3
# USAGE: ./add_new_page.py $in_filepath $out_filepath
import sys

from fpdf import FPDF
from pdfrw import PdfReader, PdfWriter

IN_FILEPATH = sys.argv[1]
OUT_FILEPATH = sys.argv[2]
NEW_PAGE_INDEX = 1  # set to None to append at the end


def new_page():
    fpdf = FPDF()
    fpdf.add_page()
    fpdf.set_font("helvetica", size=36)
    fpdf.text(50, 50, "Hello!")
    reader = PdfReader(fdata=bytes(fpdf.output()))
    return reader.pages[0]


writer = PdfWriter(trailer=PdfReader(IN_FILEPATH))
writer.addpage(new_page(), at_index=NEW_PAGE_INDEX)
writer.write(OUT_FILEPATH)
def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw):
    with open(f, "rb") as inf:
        orig_imgdata = inf.read()
    output = img2pdf.convert(orig_imgdata, nodate=True,
                             with_pdfrw=with_pdfrw)
    from io import StringIO, BytesIO
    from pdfrw import PdfReader, PdfName, PdfWriter
    from pdfrw.py23_diffs import convert_load, convert_store
    x = PdfReader(StringIO(convert_load(output)))
    self.assertEqual(sorted(x.keys()),
                     [PdfName.Info, PdfName.Root, PdfName.Size])
    self.assertEqual(x.Size, '7')
    self.assertEqual(x.Info, {})
    self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages, PdfName.Type])
    self.assertEqual(x.Root.Type, PdfName.Catalog)
    self.assertEqual(sorted(x.Root.Pages.keys()),
                     [PdfName.Count, PdfName.Kids, PdfName.Type])
    self.assertEqual(x.Root.Pages.Count, '1')
    self.assertEqual(x.Root.Pages.Type, PdfName.Pages)
    self.assertEqual(len(x.Root.Pages.Kids), 1)
    self.assertEqual(sorted(x.Root.Pages.Kids[0].keys()),
                     [PdfName.Contents, PdfName.MediaBox, PdfName.Parent,
                      PdfName.Resources, PdfName.Type])
    self.assertEqual(x.Root.Pages.Kids[0].MediaBox, ['0', '0', '115', '48'])
    self.assertEqual(x.Root.Pages.Kids[0].Parent, x.Root.Pages)
    self.assertEqual(x.Root.Pages.Kids[0].Type, PdfName.Page)
    self.assertEqual(x.Root.Pages.Kids[0].Resources.keys(),
                     [PdfName.XObject])
    self.assertEqual(x.Root.Pages.Kids[0].Resources.XObject.keys(),
                     [PdfName.Im0])
    self.assertEqual(x.Root.Pages.Kids[0].Contents.keys(),
                     [PdfName.Length])
    self.assertEqual(x.Root.Pages.Kids[0].Contents.Length,
                     str(len(x.Root.Pages.Kids[0].Contents.stream)))
    self.assertEqual(x.Root.Pages.Kids[0].Contents.stream,
                     "q\n115.0000 0 0 48.0000 0.0000 0.0000 cm\n/Im0 "
                     "Do\nQ")

    imgprops = x.Root.Pages.Kids[0].Resources.XObject.Im0

    # test if the filter is valid:
    self.assertIn(
        imgprops.Filter, [[PdfName.DCTDecode], [PdfName.JPXDecode],
                          [PdfName.FlateDecode]])

    # test if the colorspace is valid
    self.assertIn(
        imgprops.ColorSpace, [PdfName.DeviceGray, PdfName.DeviceRGB,
                              PdfName.DeviceCMYK])

    # test if the image has correct size
    orig_img = Image.open(f)
    self.assertEqual(imgprops.Width, str(orig_img.size[0]))
    self.assertEqual(imgprops.Height, str(orig_img.size[1]))

    # if the input file is a jpeg then it should've been copied
    # verbatim into the PDF
    if imgprops.Filter in [[PdfName.DCTDecode], [PdfName.JPXDecode]]:
        self.assertEqual(
            x.Root.Pages.Kids[0].Resources.XObject.Im0.stream,
            convert_load(orig_imgdata))
    elif imgprops.Filter == [PdfName.FlateDecode]:
        # otherwise, the data is flate encoded and has to be equal to
        # the pixel data of the input image
        imgdata = zlib.decompress(
            convert_store(x.Root.Pages.Kids[0].Resources.XObject.Im0.stream))
        colorspace = imgprops.ColorSpace
        if colorspace == PdfName.DeviceGray:
            colorspace = 'L'
        elif colorspace == PdfName.DeviceRGB:
            colorspace = 'RGB'
        elif colorspace == PdfName.DeviceCMYK:
            colorspace = 'CMYK'
        else:
            raise Exception("invalid colorspace")
        im = Image.frombytes(colorspace,
                             (int(imgprops.Width), int(imgprops.Height)),
                             imgdata)
        if orig_img.mode == '1':
            orig_img = orig_img.convert("L")
        elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"):
            orig_img = orig_img.convert("RGB")
        self.assertEqual(im.tobytes(), orig_img.tobytes())
        # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have
        # the close() method
        try:
            im.close()
        except AttributeError:
            pass

    # now use pdfrw to parse and then write out both pdfs and check the
    # result for equality
    y = PdfReader(out)
    outx = BytesIO()
    outy = BytesIO()
    xwriter = PdfWriter()
    ywriter = PdfWriter()
    xwriter.trailer = x
    ywriter.trailer = y
    xwriter.write(outx)
    ywriter.write(outy)
    self.assertEqual(outx.getvalue(), outy.getvalue())

    # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
    # close() method
    try:
        orig_img.close()
    except AttributeError:
        pass
sections = [
    'Introduction',
    '1_Experimental_datasets',
    '2_Structured_data_from_literature',
    '3_Analysis_tools',
    '4_Simulation_environments',
    '5_Model_sharing',
    '6_Computing_infrastructure',
    '7_Open_source_initiatives',
    '8_Web_portals'
]
# sections = ['Introduction', '1_Experimental_datasets',
#             '2_Structured_data_from_literature']

big_file = PdfWriter()
for section in sections:
    for f in os.listdir(section):
        fpath = section + '/' + f
        if os.path.isfile(fpath) and fpath.endswith(
                'pptx') and not f == 'Template.pptx':
            print("Incorporating: %s" % fpath)
            call([
                "libreoffice", "--headless", "--invisible", "--convert-to",
                "pdf", fpath
            ])
            pdf_file_name = f.replace('pptx', 'pdf')
            pdf_file = PdfReader(pdf_file_name)
            print(" Adding pages from %s" % pdf_file_name)
            big_file.addpages(pdf_file.pages)

big_file.write('Part1.pdf')
print("Done.")
alt_img = PdfDict(Type=PdfName.XObject,
                  SubType=PdfName.Image,
                  BitsPerComponent=8,
                  ColorSpace=PdfName.DeviceRGB,
                  Height=800,
                  Width=600,
                  Length=0,
                  F=PdfDict(FS=PdfName.URL,
                            F='https://chezsoi.org/lucas/ThePatch.jpg'),
                  FFilter=PdfName.DCTDecode)
alt_img.indirect = True
alternates = PdfArray([PdfDict(DefaultForPrinting=True, Image=alt_img)])
alternates.indirect = True

img_name = PdfName('Image-9960')
img = img_kid.Resources.XObject[img_name]
img.Alternates = alternates

pdf_kid.Resources.XObject = PdfDict()
pdf_kid.Resources.XObject[img_name] = img

out = PdfWriter()
out.addpage(pdf.pages[0])
out.write('out.pdf')

# CONCLUSION: neither Adobe nor Sumatra readers visit the link...
# It may be that readers no longer honour the "Alternates" images feature,
# that HTTPS is not supported, or that I made a mistake in the resulting PDF.
# Anyway, I'm giving up.
# However, Canary Tokens use a similar technique that works well (with Adobe,
# not Sumatra): https://github.com/sumatrapdfreader/sumatrapdf/issues/1696
def save_to_file(pdf_obj, file_path):
    short_path_for_logging = '/'.join(file_path.split('/')[-3:])
    logger.debug("Saving to file: " + short_path_for_logging)
    y = PdfWriter()
    y.write(file_path, pdf_obj)