def renderToPdf(envLL, filename, sizex, sizey):
    """Render every layer in MAPNIK_LAYERS and merge them into one PDF page.

    Each layer ``<name>`` is loaded from ``<name>.xml`` and rendered into its
    own file ``<basename>_<name>.pdf`` (these files are left on disk).  The
    first page of the first layer becomes the base page; the first page of
    every following layer is merged on top of it.  The merged document is
    written to ``filename``.

    Arguments:
    envLL -- bounding box, converted with LLToMerc() before zooming the map
    filename -- path of the merged output PDF
    sizex, sizey -- size given to both the cairo surface and the mapnik map

    Raises ValueError when MAPNIK_LAYERS yields no layer.
    """
    basefilename = os.path.splitext(filename)[0]
    # The projected envelope is the same for every layer: compute it once.
    envMerc = LLToMerc(envLL)

    mergedpdf = PdfFileWriter()
    basepage = None
    # The per-layer files are kept open until the merged document has been
    # written, because the pages added to the writer still refer to them.
    layerfiles = []
    try:
        for mapname in MAPNIK_LAYERS:
            print('Rendering %s' % (mapname,))

            # Render layer PDF.
            localfilename = basefilename + '_' + mapname + '.pdf'
            surface = cairo.PDFSurface(localfilename, sizex, sizey)
            layermap = mapnik.Map(sizex, sizey)
            mapnik.load_map(layermap, mapname + ".xml")
            layermap.zoom_to_box(envMerc)
            mapnik.render(layermap, surface)
            surface.finish()

            # Merge with master.
            layerfile = open(localfilename, "rb")
            layerfiles.append(layerfile)
            layerpage = PdfFileReader(layerfile).getPage(0)
            if basepage is None:
                basepage = layerpage
                mergedpdf.addPage(basepage)
            else:
                basepage.mergePage(layerpage)

        if basepage is None:
            raise ValueError('MAPNIK_LAYERS is empty, nothing to render')

        with open(filename, 'wb') as output:
            mergedpdf.write(output)
    finally:
        for layerfile in layerfiles:
            layerfile.close()
def add(request):
    """Upload a document.

    On a valid POST the form is saved for ``request.user``.  Title and author
    are read from the PDF metadata when possible; a placeholder is used for
    whichever value is missing or when the file cannot be parsed.  The client
    is then redirected to the edit page of the new document.

    Any other request, or an invalid form, renders ``add.html`` with the form.
    """
    if request.method == "POST":
        form = AddDocumentForm(request.POST, request.FILES)
        if form.is_valid():
            document = form.save(commit=False)
            document.user = request.user

            title = author = None
            try:
                from pyPdf import PdfFileReader
                info = PdfFileReader(document.file).getDocumentInfo()
            except Exception:
                # Best effort: an unreadable upload simply gets the
                # placeholder values below.
                info = None
            if info is not None:
                title = info.title
                author = info.author

            document.title = title or "( Insert title )"
            document.author = author or "( Insert author )"
            document.save()
            return HttpResponseRedirect('/documents/edit/' + str(document.id))
    else:
        form = AddDocumentForm()

    context = {
        'form': form,
    }
    return render_to_response('add.html', context,
                              context_instance=RequestContext(request))
def test_read_pdf(self):
    """Read data/1305.0445.pdf and check metadata access and text extraction.

    Logs the title, page count, content stream, page entries and annotations
    of the first page, then checks that the extracted text contains a space,
    a newline and a known sentence fragment.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    here = os.path.split(__file__)[0]
    pdffile = os.path.join(here, "data", "1305.0445.pdf")
    assert os.path.exists(pdffile)

    with open(pdffile, "rb") as stream:
        reader = PdfFileReader(stream)

        # Document level information.
        title = reader.getDocumentInfo().title
        traw = reader.getDocumentInfo().title_raw
        npage = reader.getNumPages()
        fLOG("title", title, "*", traw)
        fLOG("nb pages", npage)

        # First page: raw content, keys, annotations and items.
        first = reader.getPage(0)
        fLOG("cont", first.getContents())
        for key in first:
            fLOG("obj", key, "*", key.title())
        for annot in first.raw_get("/Annots"):
            fLOG("annot", annot, dir(annot))
        for item in first.items():
            fLOG("item", item)

        # Text extraction.
        text = first.extractText()
        fLOG("text---", text)
        assert " " in text
        assert "\n" in text
        if not ("algorithms: their inability" in text):
            raise Exception(text)
def setMetadata(self, metadata):
    """Return the document, as PDF bytes, with new metadata.

    Every page of the file at ``self.document.getUrl()`` is copied into a new
    PDF whose info dictionary is updated from ``metadata``.

    Keyword arguments:
    metadata -- expected a dictionary with metadata.  A truthy
                'ModificationDate' entry is stored under 'ModDate'; a
                'Keywords' list is joined with single spaces.  The caller's
                dictionary is not modified.
    """
    # TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and CreationDate
    # Work on a copy so the caller's dictionary is left untouched.
    metadata = dict(metadata)
    modification_date = metadata.pop("ModificationDate", None)
    if modification_date:
        metadata['ModDate'] = modification_date
    keywords = metadata.get('Keywords')
    if isinstance(keywords, list):
        # str.join is a method of the separator, not of the list.
        metadata['Keywords'] = ' '.join(keywords)

    # NOTE(review): capitalize() lower-cases everything after the first
    # character, so 'ModDate' is stored as '/Moddate' -- confirm that this is
    # the intended key.
    args = {}
    for key, value in metadata.items():
        args[NameObject('/' + key.capitalize())] = createStringObject(value)

    output_pdf = PdfFileWriter()
    output_pdf._info.getObject().update(args)

    # The source must stay open until the writer has serialised the pages.
    with open(self.document.getUrl(), "rb") as input_file:
        input_pdf = PdfFileReader(input_file)
        for page_num in range(input_pdf.getNumPages()):
            output_pdf.addPage(input_pdf.getPage(page_num))
        output_stream = io.BytesIO()
        output_pdf.write(output_stream)
    return output_stream.getvalue()
def rewrite(self, context, font=None):
    """Stamp text onto the first page of ``self.path``; write ``self.destination``.

    Arguments:
    context -- iterable of dicts with the keys 'x', 'y' and 'value'; each
               value is drawn at (x, y) on a letter-sized overlay
    font -- dict with the keys 'name' and 'size'; defaults to
            Times-Roman, size 11

    Only the first page of the existing PDF is written to the output.
    Returns True.
    """
    if font is None:
        font = {'name': 'Times-Roman', 'size': 11}

    # create a new PDF with Reportlab
    packet = StringIO.StringIO()
    can = canvas.Canvas(packet, pagesize=letter)
    can.setFont(font['name'], font['size'])
    for item in context:
        can.drawString(item['x'], item['y'], item['value'])
    can.save()

    # move to the beginning of the StringIO buffer
    packet.seek(0)
    new_pdf = PdfFileReader(packet)

    # read your existing PDF; it has to stay open until the output is written
    with open(self.path, "rb") as source:
        existing_pdf = PdfFileReader(source)

        # merge the new file with the existing
        page = existing_pdf.getPage(0)
        page.mergePage(new_pdf.getPage(0))
        output = PdfFileWriter()
        output.addPage(page)

        # finally, write "output" to a real file
        with open(self.destination, "wb") as outputStream:
            output.write(outputStream)
    return True
def _getPDFText(self, filename, d):
    """Extract the text of every page of a PDF.

    Each entry of the document information dictionary is passed to
    ``d.addConceptKeyType`` with the leading character of its key removed.
    Page text that is not a ``str`` is NFKD-normalised and reduced to ASCII
    (characters that cannot be encoded are dropped).

    Returns a list with one string per page, each followed by ". ", or None
    when anything goes wrong (the error is logged).
    """
    logger.debug(u"filename: %s" % filename)

    newparatextlist = list()

    try:
        with open(filename, u"rb") as pdfFile:
            pdfDoc = PdfFileReader(pdfFile)

            pdfDict = pdfDoc.getDocumentInfo()
            # A PDF without an info dictionary must not abort the extraction.
            if pdfDict is not None:
                for x in pdfDict.keys():
                    d.addConceptKeyType(x[1:], pdfDict[x])

            for page in pdfDoc.pages:
                text = page.extractText()
                if not isinstance(text, str):
                    # The converted value used to be thrown away; keep it.
                    text = unicodedata.normalize(u'NFKD', text).encode(u'ascii', u'ignore')
                logger.debug(u"PDF : %s" % text)
                newparatextlist.append(text + u". ")

        return newparatextlist

    except Exception as msg:
        logger.error(u"%s" % msg)
        return None
def createPDFHttpResponse(filepath, output_filename, user, access_time):
    """
    Creates a HttpResponse from a watermarked PDF file. Watermark contains the
    user who accessed the document and the time of access.

    :param filepath: Path to the file
    :param output_filename: File name sent to the user
    :param user: value formatted into the watermark text
    :param access_time: object whose ``isoformat(' ')`` is put in the watermark
    :return: HttpResponse with the file content
    :raises IOError: if ``filepath`` cannot be opened (this function does not
        build a not-found response itself)
    """
    # Add access watermark
    watermark_buffer = StringIO()
    p = canvas.Canvas(watermark_buffer)
    p.drawString(0, 0, "Downloaded by %s at %s" % (user, access_time.isoformat(' ')))
    p.showPage()
    p.save()
    watermark_buffer.seek(0)
    watermark_page = PdfFileReader(watermark_buffer).getPage(0)

    # Read the PDF to be accessed; keep it open until the response is written
    with open(filepath, 'rb') as source:
        attachment = PdfFileReader(source)
        output = PdfFileWriter()

        # Attach watermark to each page
        for page in attachment.pages:
            page.mergePage(watermark_page)
            output.addPage(page)

        response = HttpResponse(mimetype='application/pdf')
        response['Content-Disposition'] = 'inline; filename=%s' % output_filename.encode('utf-8')
        output.write(response)
    return response
def getPDFContents(path):
    """Return the lower-cased text of a PDF with whitespace collapsed.

    Non-breaking spaces are turned into plain spaces and every run of
    whitespace becomes a single space.  An encrypted PDF is reported on
    stdout and yields an empty string.

    Errors are not raised: the exception arguments are printed and whatever
    text was extracted before the failure is returned as is (not collapsed).
    """
    parts = []
    collapsed = None
    try:
        with open(path, "rb") as handle:
            pdf = PdfFileReader(handle)
            if pdf.isEncrypted:
                print("%s is encrypted!" % path)
            else:
                # get all pages and put them in a string
                for i in range(pdf.getNumPages()):
                    parts.append(pdf.getPage(i).extractText().lower() + " \n")
        collapsed = u" ".join(
            "".join(parts).replace(u"\xa0", u" ").strip().split())
    except Exception as e:
        print(e.args)

    if collapsed is None:
        # Extraction stopped part-way: hand back what was read so far.
        return "".join(parts)
    return collapsed
def split_chapters(*t_args):
    """
    Split a large pdf into chunks (i.e. chapters)

    The first positional argument is a sequence whose first item is the path
    of a JSON config file with the keys "source", "outputdir",
    "chapter_prefix", "chapter_fmt", "first_chapter_index", "firstpage" and
    "chapters_ends".  Chapter ``i`` covers the pages from the end of the
    previous chapter ("firstpage" for the first one) up to, but excluding,
    ``chapters_ends[i]``.

    Prints a usage line and returns when no config file is given.
    """
    # Calling without any argument used to leave ``args`` unbound.
    args = t_args[0] if t_args else ()
    if len(args) < 1:
        print("usage: utils_pdf split_chapters configfile")
        return

    from pyPdf import PdfFileWriter, PdfFileReader

    with open(args[0]) as f:
        P = json.load(f)

    i0 = P["first_chapter_index"]
    ends = P["chapters_ends"]

    # The source stays open until every chapter has been written.
    with open(P["source"], "rb") as source:
        reader = PdfFileReader(source)
        for i, end in enumerate(ends):
            ch_num = i0 + i
            fmt = P["chapter_fmt"] % (ch_num, )
            output = PdfFileWriter()
            if not os.path.exists(P["outputdir"]):
                os.mkdir(P["outputdir"])
            fn_out = "%s/%s%s" % (P["outputdir"], P["chapter_prefix"], fmt)
            j0 = P["firstpage"] if i == 0 else ends[i - 1]
            for j in range(j0, end):
                output.addPage(reader.getPage(j))
            with open(fn_out, "wb") as outputStream:
                output.write(outputStream)
            print("wrote %s" % (fn_out,))
def parse_file(pdfFile, nameFile):
    """Split a PDF by recipient name and e-mail each part.

    The pages are scanned in order.  When the text of a page contains one of
    the names returned by ``get_names(nameFile)`` (case-insensitive), the PDF
    collected so far is sent with ``send_email`` and a new one is started
    with that page.  Pages before the first match are dropped.  The last
    collected PDF is sent at the end.

    The SMTP connection to localhost is closed even when sending fails.
    """
    with open(pdfFile, "rb") as source:
        pdfReader = PdfFileReader(source)

        # read the names and emails from csv file
        names = get_names(nameFile)

        # create an instance in SMTP server
        smtp = smtplib.SMTP('localhost')
        try:
            pdfWriter = None   # PDF being collected for prevName
            prevName = ""

            for i in range(pdfReader.getNumPages()):
                page = pdfReader.getPage(i)
                pageStr = page.extractText().lower()  # extract the pdf text

                for name in names.keys():
                    if name.lower() in pageStr:
                        if pdfWriter is not None:
                            # send the current pdf
                            send_email(smtp, pdfWriter, prevName, names)
                        # create new pdfWriter and remember who it is for
                        pdfWriter = PdfFileWriter()
                        prevName = name
                        break

                if pdfWriter is not None:
                    pdfWriter.addPage(page)

            # send the last file
            if pdfWriter is not None:
                send_email(smtp, pdfWriter, prevName, names)
        finally:
            # quit the smtp server
            smtp.quit()
def add_omr_marks(self, pdf_data, is_latest_document):
    """Return ``pdf_data`` with an OMR layer merged onto every even page.

    :param pdf_data: content of the PDF, written into a StringIO buffer
    :param is_latest_document: when true, the page whose index equals
        ``total_pages // 2`` gets the marks computed for the latest page
    :return: the content of the resulting PDF
    """
    # Documentation
    # http://meteorite.unm.edu/site_media/pdf/reportlab-userguide.pdf
    # https://pythonhosted.org/PyPDF2/PdfFileReader.html
    # https://stackoverflow.com/a/17538003
    # https://gist.github.com/kzim44/5023021
    # https://www.blog.pythonlibrary.org/2013/07/16/
    # pypdf-how-to-write-a-pdf-to-memory/
    self.ensure_one()

    pdf_buffer = StringIO.StringIO()
    pdf_buffer.write(pdf_data)

    existing_pdf = PdfFileReader(pdf_buffer)
    output = PdfFileWriter()
    total_pages = existing_pdf.getNumPages()

    # print latest omr mark on latest pair page (recto)
    # NOTE(review): total_pages // 2 is an odd index for some page counts
    # (e.g. 2 or 6 pages), in which case no page is flagged -- confirm.
    latest_omr_page = total_pages // 2

    for page_number in range(total_pages):
        page = existing_pdf.getPage(page_number)
        # only print omr marks on pair pages (recto)
        # (compare by value: identity on integers is an implementation detail)
        if page_number % 2 == 0:
            is_latest_page = is_latest_document and \
                page_number == latest_omr_page
            marks = self._compute_marks(is_latest_page)
            omr_layer = self._build_omr_layer(marks)
            page.mergePage(omr_layer)
        output.addPage(page)

    out_buffer = StringIO.StringIO()
    output.write(out_buffer)

    return out_buffer.getvalue()
def joinpdf(folder=TMPFOLDER,startpage=INDEX,outputname='freecad.pdf'):
    """creates one pdf file from several others, following order from startpage

    The page order is taken from the href targets found inside the first
    ``<ul ...>...</ul>`` span of ``<folder>/<startpage>.html``; the start page
    itself is inserted at position 1.  For each page the matching
    ``<folder>/<page>.pdf`` is appended; files that cannot be read are
    reported and skipped.  The result goes to ``OUTPUTPATH/<outputname>``.
    """
    if VERBOSE: print ("Building table of contents...")

    with open(folder+os.sep+startpage+'.html') as f:
        html = f.read()
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    html = re.findall("<ul.*/ul>",html)[0]
    pages = re.findall('href="(.*?)"',html)
    pages.insert(1,startpage+".html")

    result = PdfFileWriter()
    # Inputs stay open until the merged file is written, then all are closed.
    sources = []
    try:
        for p in pages:
            # NOTE(review): this tests the bare page name, not a path inside
            # ``folder`` -- confirm that this is what is meant.
            if exists(p[:-5]):
                if VERBOSE: print ('Appending',p)
                source = None
                try:
                    source = open(folder+os.sep+p[:-5]+'.pdf','rb')
                    inputfile = PdfFileReader(source)
                except Exception:
                    if source is not None:
                        source.close()
                    print ('Unable to append',p)
                else:
                    sources.append(source)
                    for i in range(inputfile.getNumPages()):
                        result.addPage(inputfile.getPage(i))

        with open(OUTPUTPATH + os.sep + outputname,'wb') as outputfile:
            result.write(outputfile)
    finally:
        for source in sources:
            source.close()

    if VERBOSE: print ('Successfully created',OUTPUTPATH,os.sep,outputname)
def save(self, to):
    """Render every page class onto the origin template and write the PDF.

    For each class in ``self.pages`` the object returned by
    ``page_class(self.instance).save()`` is merged onto a fresh copy of the
    first page of the origin PDF, and the result is appended to the output.

    :param to: a file name, or a writable file-like object.  The stream is
        closed after writing in both cases.
    :raises RuntimeError: if no origin is configured or it cannot be opened.
    """
    import copy

    origin = self.get_origin()
    if not origin:
        raise RuntimeError("Please implement get_origin method or origin attribute")

    try:
        origin_file = open(origin, "rb")
    except IOError:
        raise RuntimeError(u"Failed to open origin file")

    try:
        try:
            existing_pdf = PdfFileReader(origin_file)
        except IOError:
            raise RuntimeError(u"Failed to open origin file")

        template_page = existing_pdf.getPage(0)
        output = PdfFileWriter()
        for page_class in self.pages:
            new_page = page_class(self.instance).save()
            # Merge onto a copy: merging onto the template itself would stack
            # every overlay on the single page object shared by all output
            # pages.
            base_page = copy.copy(template_page)
            base_page.mergePage(new_page)
            output.addPage(base_page)

        if isinstance(to, basestring):
            outputStream = open(to, "wb")
        else:
            outputStream = to
        output.write(outputStream)
        outputStream.close()
    finally:
        origin_file.close()
def __call__(self, data, attachments=(), pages=None):
    """Render the field templates, fill the form and assemble the output PDF.

    Every field in ``self.fields`` that has a "template" entry is rendered
    with ``self.template_args(data)``; a rendering error is logged and the
    field is skipped.  The rendered values are passed to ``self.exec_pdftk``,
    the watermarks in ``self.watermarks`` are merged onto their pages, and
    the selected pages are copied to the output.

    :param data: source data handed to ``template_args``
    :param attachments: objects with a ``pdf()`` method; each result is
        merged onto a new blank page appended to the output
    :param pages: page indexes to include; all pages when empty or None
    :return: the PdfFileWriter holding the assembled document
    """
    self.rendered = {}
    for field, ctx in self.fields.items():
        if "template" not in ctx:
            continue

        self.context = ctx
        kwargs = self.template_args(data)
        template = self.context["template"]

        try:
            rendered_field = template.render(**kwargs)
        except Exception as err:
            logger.error("%s: %s %s", field, template, err)
        else:
            # Skip the field if it is already rendered by filter
            if field not in self.rendered:
                self.rendered[field] = rendered_field

    filled = PdfFileReader(self.exec_pdftk(self.rendered))
    for pagenumber, watermark in self.watermarks:
        page = filled.getPage(pagenumber)
        page.mergePage(watermark)

    output = PdfFileWriter()
    pages = pages or xrange(filled.getNumPages())
    for p in pages:
        output.addPage(filled.getPage(p))

    for attachment in attachments:
        output.addBlankPage().mergePage(attachment.pdf())

    return output
class cleanpdf:
    """Copy a PDF while overwriting the entries of the new info dictionary.

    Constructing an instance does all the work: every page of ``pathFile`` is
    copied to a sibling file whose name ends in ``5.pdf``, and each entry of
    the writer's info dictionary is replaced by a fixed HTML test string.
    """

    def __init__(self, pathFile):
        self.pathFile = pathFile
        # Left open on purpose: self.pdfInput keeps reading from it.  Call
        # close() once the instance is no longer needed.
        self.inputFile = open(self.pathFile, "rb")
        self.pdfInput = PdfFileReader(self.inputFile)
        self.pyPdfOutput = PdfFileWriter()
        self.dataToUpdate = self.pyPdfOutput._info.getObject()
        self.__modifyData()
        self.__copyPDF()

    def close(self):
        """Release the handle on the input file."""
        self.inputFile.close()

    def __modifyData(self):
        """Replace every value of the output info dictionary."""
        # Iterate over a snapshot of the keys while assigning.
        for data in list(self.dataToUpdate):
            self.dataToUpdate[data] = createStringObject(('<h1 onmouseover=alert(1)>').encode('ascii'))

    def __copyPDF(self):
        """Copy all input pages to the output file and flush it to disk."""
        for page in range(0, self.pdfInput.getNumPages()):
            self.pyPdfOutput.addPage(self.pdfInput.getPage(page))
        # Close explicitly so the data is on disk when the constructor returns.
        with open(self.__changeName(), "wb") as outputFile:
            self.pyPdfOutput.write(outputFile)

    def __changeName(self):
        """Return the output path: the input path with its extension replaced by ``5.pdf``."""
        import os
        # splitext also copes with paths that have no extension at all.
        newName = os.path.splitext(self.pathFile)[0] + "5.pdf"
        return newName
def test_cat(self):
    """Make sure files are properly concatenated."""
    check_call([STAPLER, 'cat', ONEPAGE_PDF, FIVEPAGE_PDF,
                self.outputfile])
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(os.path.isfile(self.outputfile))
    with open(self.outputfile, 'rb') as outfile:
        pdf = PdfFileReader(outfile)
        # one page + five pages
        self.assertEqual(pdf.getNumPages(), 6)
def split_pset():
    """Split ``pset<N>_answers.pdf`` into one PDF per question.

    Reads ``options.pset`` and ``options.probs``.  ``options.probs`` is a
    comma separated list; each item is turned into page numbers by
    ``get_pages`` and those pages (1-based) are written to
    ``pset<N>/latex/pset<N>-<question>_answer.pdf``, questions being numbered
    from 1 in the order given.
    """
    if (not options.pset or not options.probs):
        print_err_and_die("You must enter both arguements! run with -h for help")

    path = "pset%s/latex/" % options.pset
    filename = "%spset%s_answers.pdf" % (path, options.pset)
    try:
        source = open(filename, "rb")
        inp = PdfFileReader(source)
    except IOError:
        print_err_and_die("Error! File, %s was not found." % filename)

    # loop over user input and break up pdf; the source stays open until the
    # last question has been written
    try:
        for questionNum, prob in enumerate(options.probs.split(","), 1):
            print("Processing question %s" % (questionNum,))
            prob = prob.strip()  # kill whitespace
            out = PdfFileWriter()
            for page in get_pages(prob, inp.getNumPages()):
                print("page num %s" % (str(page),))
                out.addPage(inp.getPage(int(page) - 1))
            outName = "%spset%s-%s_answer.pdf" % (path, options.pset, questionNum)
            with open(outName, "wb") as outStream:
                out.write(outStream)
    finally:
        source.close()
    print("Done!")
def select(filesandranges, outputfilename, verbose):
    """Write the selected pages of several PDFs into one new PDF.

    filesandranges -- list of dicts with the keys "name" (path of a PDF) and
                      "pages" (1-based page numbers to copy, in order)
    outputfilename -- path of the PDF to create; must not exist yet
    verbose -- when true, the arguments are printed first

    Exits with status 2 when an input is missing or unreadable or when the
    output already exists, and with status 3 when a requested page is out of
    range.
    """
    if verbose:
        print (str(filesandranges)+"\noutput: "+str(outputfilename))

    for entry in filesandranges:
        if not os.path.exists(entry['name']):
            halp()
            print ("error: "+entry['name']+" does not exist... exiting nao")
            sys.exit(2) # pdf file is no pdf file...
    if os.path.exists(outputfilename):
        halp()
        # Report the output file, which is the one that already exists.
        print ("error: "+str(outputfilename)+" does already exist... exiting nao")
        sys.exit(2) # pdf file is no pdf file...

    output = PdfFileWriter()
    sources = []  # kept open until the output has been written
    try:
        try:
            for pdf in filesandranges:
                source = open(pdf["name"], "rb")
                sources.append(source)
                fiel = PdfFileReader(source)
                for pagenr in pdf["pages"]:
                    if 1 <= pagenr <= fiel.getNumPages():
                        output.addPage(fiel.getPage(pagenr-1))
                    else:
                        print("one or more pages are not in the chosen PDF")
                        halp()
                        sys.exit(3) #wrong pages or ranges
        except Exception:
            # SystemExit is not an Exception, so the exit status 3 raised
            # above is no longer turned into status 2 here.
            halp()
            sys.exit(2) # pdf file is no pdf file...

        if (not os.path.exists(outputfilename)):
            with open(outputfilename, "wb") as outputStream:
                output.write(outputStream)
        else:
            print ("file exists, discontinuing operation")
    finally:
        for source in sources:
            source.close()
def delete(filesandranges, outputfilename, verbose):
    """Write several PDFs into one new PDF, leaving out the listed pages.

    filesandranges -- list of dicts with the keys "name" (path of a PDF) and
                      "pages" (1-based page numbers to drop)
    outputfilename -- path of the PDF to create; must not exist yet
    verbose -- accepted for symmetry with select(); not used here

    Exits with status 2 when an input is missing or unreadable or when the
    output already exists.
    """
    for entry in filesandranges:
        if not os.path.exists(entry['name']):
            halp()
            print ("error: "+entry['name']+" does not exist... exiting nao")
            sys.exit(2) # pdf file is no pdf file...
    if os.path.exists(outputfilename):
        halp()
        # Report the output file, which is the one that already exists.
        print ("error: "+str(outputfilename)+" does already exist... exiting nao")
        sys.exit(2) # pdf file is no pdf file...

    output = PdfFileWriter()
    sources = []  # kept open until the output has been written
    try:
        try:
            for pdf in filesandranges:
                print (pdf["name"])
                source = open(pdf["name"], "rb")
                sources.append(source)
                fiel = PdfFileReader(source)
                for pagenr in range(1,fiel.getNumPages()+1):
                    if (pagenr not in pdf["pages"]):
                        output.addPage(fiel.getPage(pagenr-1))
        except Exception:
            # Do not intercept SystemExit / KeyboardInterrupt.
            halp()
            sys.exit(2) # pdf file is no pdf file...

        if (not os.path.exists(outputfilename)):
            with open(outputfilename, "wb") as outputStream:
                output.write(outputStream)
        else:
            print ("file exists, discontinuing operation")
    finally:
        for source in sources:
            source.close()
def getNPersonal(paper):
    """Count the occurrences of ' we ' and ' i ' in the PDF of a paper.

    The PDF is downloaded from the first entry of ``paper.links`` whose
    ``title`` is 'pdf'; its text is lower-cased before counting.  A total of
    exactly 1 is reported as 0.

    Returns the count (also printed), or -1 when there is no pdf link or the
    file cannot be downloaded or read.
    """
    pdfURL = None
    for link in paper.links:
        try:
            if link.title == 'pdf':
                pdfURL = link['href']
                break
        except AttributeError:
            continue

    if pdfURL is None:
        # Without this guard the lookup below failed with an unbound name.
        print("Error reading file")
        return -1

    try:
        rFile = urlopen(Request(pdfURL)).read()
        pdfFile = PdfFileReader(StringIO(rFile))
        thisNPersonal = 0
        for page in range(pdfFile.getNumPages()):
            pageStr = pdfFile.getPage(page).extractText().lower()
            thisNPersonal += pageStr.count(' we ')
            thisNPersonal += pageStr.count(' i ')
    except Exception:
        print("Error reading file")
        return -1

    thisNPersonal = 0 if thisNPersonal == 1 else thisNPersonal
    print(thisNPersonal)
    return thisNPersonal
def process_file(f):
    """Splits the file into parts if necessary, then adds it to the global
    queue.

    ``f`` is a file name inside ``path_to_watch``.  Names that do not end in
    ".pdf" are rejected.  A PDF with more than ``10 + real_leeway`` pages is
    handed to ``split_file``; any other PDF is appended to ``file_queue``.
    Failures are logged, never raised.
    """
    global file_queue
    filename = path_to_watch + "/" + f

    # Non-pdfs are not supported
    if (filename[-4:] != ".pdf"):
        log("Not a valid PDF file.")
        return

    try:
        fp = open(filename, 'rb')
    except IOError as e:
        log("ERROR: Unable to process file "+filename)
        log(str(e))
        return

    # From here on the file is closed on every path, error returns included.
    try:
        try:
            pdf_f = PdfFileReader(fp)
        except IOError as e:
            log("ERROR: Unable to process file "+filename)
            log(str(e))
            return
        except Exception as e:
            # Was ``except e:``, which itself failed with a NameError.
            log("ERROR: Unable to read PDF File")
            log(str(e))
            return

        if pdf_f.getNumPages() > (10 + real_leeway):
            split_file(pdf_f, filename)
        else:
            file_queue.append(filename)
    finally:
        fp.close()
def pdf(coursesid, examsid):
    ''' Creates a blank PDF of this exam

    Draws the text 'Hello' on a letter-sized overlay, merges it onto the
    first page of the template PDF and returns the result as an
    ``application/pdf`` response.  Both arguments are currently unused.
    '''
    # TODO: Obviously fix this up to generate actual PDFs; this is just a proof of concept
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    from pyPdf import PdfFileWriter, PdfFileReader
    from io import BytesIO

    overlay = BytesIO()
    p = canvas.Canvas(overlay, pagesize=letter)
    p.drawString(100, 100, 'Hello')
    p.save()
    overlay.seek(0)
    new_pdf = PdfFileReader(overlay)

    # The template stays open until the merged page has been serialised.
    with open('/home/treece/src/web/bubbleck/res/Template.pdf', 'rb') as template:
        existing_pdf = PdfFileReader(template)
        out = PdfFileWriter()
        page = existing_pdf.getPage(0)
        page.mergePage(new_pdf.getPage(0))
        out.addPage(page)
        merged = BytesIO()
        out.write(merged)

    # write() returns None; the PDF bytes live in the buffer.
    response = make_response(merged.getvalue())
    # The quote around the file name used to be left unterminated.
    response.headers['Content-Disposition'] = 'inline; filename="sakulaci.pdf"'
    response.mimetype = 'application/pdf'
    return response
def choose_file(self, widget, data=None):
    """Let the user pick a PDF and show its text in the global ``textbuffer``.

    Opens a file chooser restricted to ``*.pdf``.  When a file is chosen, the
    text of all its pages is extracted, whitespace is collapsed to single
    spaces and the result is stored with ``textbuffer.set_text``.
    """
    global textbuffer
    dialog = gtk.FileChooserDialog("Open..",
                                   None,
                                   gtk.FILE_CHOOSER_ACTION_OPEN,
                                   (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                    gtk.STOCK_OPEN, gtk.RESPONSE_OK))
    dialog.set_default_response(gtk.RESPONSE_OK)

    pdf_filter = gtk.FileFilter()
    pdf_filter.set_name("PDF files")
    pdf_filter.add_pattern("*.pdf")
    dialog.add_filter(pdf_filter)

    response = dialog.run()
    if response == gtk.RESPONSE_OK:
        selected = dialog.get_filename()
        print('%s selected' % (selected,))
        from pyPdf import PdfFileReader
        # Read the file the user picked rather than a hard-coded name.
        with open(selected, "rb") as source:
            pdf = PdfFileReader(source)
            parts = []
            for i in range(pdf.getNumPages()):
                # Extract text from page and add to content
                parts.append(pdf.getPage(i).extractText() + "\n")
        # Collapse whitespace ("\n" and u"\xa0" were mistyped with forward
        # slashes, which put a literal "/n" into the text).
        content = " ".join("".join(parts).replace(u"\xa0", " ").strip().split())
        textbuffer.set_text(content)
    elif response == gtk.RESPONSE_CANCEL:
        print('Closed, no files selected')
    dialog.destroy()
def output(self):
    """Export the pages listed in the line edit to a PDF chosen by the user.

    Asks for the output file name, parses the text of
    ``self.pages_line_edit`` into page indexes with ``pages_parser`` and
    copies those pages of ``self.in_filename`` to the chosen file.  Nothing
    is written when the dialog returns an empty file name.
    """
    # get the output filename using the file dialog
    (out_filename, filter) = \
        QFileDialog.getSaveFileName(parent = self,
                                    caption = self.tr(u'Export'),
                                    dir = '',
                                    filter = self.tr('pdf (*.pdf)'))
    if not out_filename:
        # No file name was returned: nothing to export.
        return

    # extract input
    pages_string = self.pages_line_edit.text()

    with open(self.in_filename, 'rb') as in_file:
        in_reader = PdfFileReader(in_file)
        out_writer = PdfFileWriter()

        # Get the indices of pages to extract
        pages = pages_parser(in_reader.getNumPages()).parse(pages_string)

        # append pages to output writer
        for page_index in pages:
            out_writer.addPage(in_reader.getPage(page_index))

        # write to file; opened only now, so a parsing error above does not
        # leave an empty or truncated output behind
        with open(out_filename, 'wb') as out_file:
            out_writer.write(out_file)
def showpdf(request):
    """Return the first page of an extracted PDF with the signature stamped on.

    GET parameters:
    f -- name of a file inside MEDIA_ROOT/pdffiles/extracted
    o -- 'l' places the signature at x=529; any other value (or none) at x=70

    Returns an attachment response, or None when ``f`` is missing.
    """
    sign = os.path.join(settings.MEDIA_ROOT, "signature.png")
    mimetypes.init()
    response = None
    # NOTE(review): a view returning None is normally an error in Django --
    # consider answering with an explicit error response instead.
    if 'f' in request.GET:
        # The name comes straight from the query string: keep its last
        # component only, so it cannot point outside the extracted/ folder.
        name = os.path.basename(request.GET['f'])
        path = os.path.join(settings.MEDIA_ROOT, 'pdffiles', 'extracted', name)
        with open(path, "rb") as fr:
            imgTemp = StringIO()
            imgDoc = canvas.Canvas(imgTemp)
            if request.GET.get('o') == 'l':
                imgDoc.drawImage(sign, 529, 40, 290/2, 154/2)
            else:
                imgDoc.drawImage(sign, 70, 40, 290/2, 154/2)
            imgDoc.save()

            overlay = PdfFileReader(StringIO(imgTemp.getvalue())).getPage(0)
            page = PdfFileReader(fr).getPage(0)
            page.mergePage(overlay)
            pdf_out = PdfFileWriter()
            pdf_out.addPage(page)

            response = HttpResponse(mimetype='application/pdf')
            response['Content-Disposition'] = 'attachment; filename=%s' % name
            pdf_out.write(response)
    return response
def read_neb_enzyme_price_list():
    """Download the NEB price list PDF and parse it into enzyme tuples.

    Returns a list of ``(name, small_cost, large_cost, small_unit,
    large_unit)`` tuples sorted by name.  Errors raised by ``urlopen`` are
    not caught here.
    """
    # throws URLError, IOError
    raw_pdf = urllib2.urlopen(NEB_PRICE_LIST_URL).read()
    reader = PdfFileReader(StringIO(raw_pdf))

    enzymes = []
    for page_number in range(reader.getNumPages()):
        # fi/fl misread hacks-- little nasty in here-- poor PDF read
        page_text = reader.getPage(page_number).extractText()
        page_text = page_text.replace(u'\u02dc','fi').replace(u'˚','fl')

        for match in NEB_PRICE_LINE_RE.finditer(page_text):
            # format of the groups will be: name prefix,
            # lastletter(+supplement)+small_cost, supplement, large_cost,
            # small_unit, large_unit
            (name_prefix, transition, supplement,
             large_cost, small_unit, large_unit) = match.groups()

            # ``transition`` holds the end of the name followed by the small
            # cost; find where the name stops.
            if supplement:
                split_at = transition.index(supplement)+len(supplement)
                name_tail = transition[:split_at]
            else:
                split_at = 1
                name_tail = transition[0]

            name = "%s%s" % (name_prefix, name_tail)
            small_cost = int_comma(transition[split_at:])
            large_cost = int_comma(large_cost)
            small_unit = int_comma(small_unit)
            large_unit = int_comma(large_unit)
            enzymes.append((name, small_cost, large_cost, small_unit, large_unit))

    enzymes.sort(key=operator.itemgetter(0))
    return enzymes
def add_guides(self):
    """Merge guide marks onto the first page of sig.pdf; write sigs.pdf.

    The guides are generated with ``generate_longarm`` when
    ``self.args.longarm`` is set, otherwise with ``generate_shortarm`` on an
    A5 or A4-landscape sized page depending on ``self.args.a5``.  All other
    pages are copied unchanged.
    """
    # Both files are closed explicitly so that sigs.pdf is complete on disk
    # when this method returns.
    with open('sig.pdf', 'rb') as source:
        pdf_in = PdfFileReader(source)
        pdf_out = PdfFileWriter()

        for i in xrange(pdf_in.getNumPages()):
            page = pdf_in.getPage(i)
            if i == 0:
                guides = StringIO()
                if self.args.longarm:
                    create_pdf(
                        guides, a4lwidth_pt, a4lheight_pt, generate_longarm())
                else:
                    if self.args.a5:
                        w, h = a5width_pt, a5height_pt
                    else:
                        w, h = a4lwidth_pt, a4lheight_pt
                    create_pdf(guides, w, h, generate_shortarm(
                        self.args.a5, bool(self.args.signature)))
                pdf_guides = PdfFileReader(guides)
                page.mergePage(pdf_guides.getPage(0))
            pdf_out.addPage(page)

        with open('sigs.pdf', 'wb') as target:
            pdf_out.write(target)
def watermark( self, pdfStr, watermarkFile, spec ):
    """Put the pages of a generated PDF on top of a watermark page.

    pdfStr -- stream handed to PdfFileReader for the generated document
    watermarkFile -- path of the PDF whose first page is the watermark
    spec -- Mark.ALL_PAGES marks every page, Mark.FIRST_PAGE only the first;
            with any other value the pages are copied unchanged

    Returns ``self.outputFile`` after writing to it when it is set, otherwise
    the content of the resulting PDF.
    """
    # The watermark file stays open until the output has been written.
    with open( watermarkFile, "rb" ) as markFile:
        # Read the watermark- and document pdf file
        inputWatermark = PdfFileReader( markFile )
        generatedPdf = PdfFileReader( pdfStr )
        outputPdf = PdfFileWriter()

        # Loop over source document pages and merge with the first page of
        # the watermark file.
        watermarkPage = inputWatermark.getPage(0)
        for index, page in enumerate( generatedPdf.pages ):
            firstPage = index == 0
            if (spec == Mark.FIRST_PAGE and firstPage) or spec == Mark.ALL_PAGES:
                # copy the watermark page here (a shallow copy), otherwise
                # the watermark page gets merged over and over because p
                # would only be a reference
                p = copy.copy( watermarkPage )
                p.mergePage( page )
                outputPdf.addPage( p )
            else:
                outputPdf.addPage( page )

        if self.outputFile:
            # Write to outputfile
            with open( self.outputFile, "wb" ) as outputStream:
                outputPdf.write( outputStream )
            return self.outputFile
        else:
            stringIO = StringIO.StringIO()
            outputPdf.write( stringIO )
            return stringIO.getvalue()
def generate(donor):
    """Build ``output/<donor>.pdf`` from the two SVG page templates.

    For each template, "/France/" is replaced by the URL-escaped donor name,
    inkscape is run once with the update/save/quit verbs and once to export
    a PDF.  The first pages of both PDFs are then combined.  Nothing is done
    when the combined file already exists.
    """
    os.system('mkdir -p output')

    donor_url = donor.replace(' ','%20')
    slug = donor.replace(' ','-').lower()
    page1 = 'output/%s1' % (slug)
    page2 = 'output/%s2' % (slug)
    combined = 'output/%s.pdf' % (slug)

    if os.path.exists(combined):
        return

    # (template, target, export command); only the first export silences
    # stderr, as before.
    jobs = (
        (page1_svg, page1,
         'inkscape --file="%s.svg" --export-pdf="%s.pdf" 2> /dev/null'),
        (page2_svg, page2,
         'inkscape --file="%s.svg" --export-pdf="%s.pdf" '),
    )
    for template, target, export_command in jobs:
        os.system('cp "%s" "%s.svg"' % (template, target))
        os.system('sed "s|/France/|/%s/|" "%s" > "%s.svg"' % (donor_url, template, target))
        os.system('inkscape --file="%s.svg" --verb=za.co.widgetlabs.update --verb=FileSave --verb=FileQuit 2> /dev/null' % (target))
        os.system(export_command % (target, target))

    # Merge pages
    output = PdfFileWriter()
    first = PdfFileReader(open('%s.pdf' % (page1), 'rb'))
    second = PdfFileReader(open('%s.pdf' % (page2), 'rb'))
    for reader in (first, second):
        output.addPage(reader.getPage(0))

    outputStream = open(combined, 'wb')
    output.write(outputStream)
    outputStream.close()

    sleep(2)
def main():
    """Combine the PDF files given on the command line into one PDF.

    The files are concatenated in the order given.  The output is written to
    the current directory under a name built from the local time,
    ``YYYY-MM-DD_HH-MM-SS.pdf``.  Exits after printing a message when fewer
    than two files are given.
    """
    # Parse command line
    pdf_files = sys.argv[1:]
    if len(pdf_files) == 0:
        print(__usage__)
        sys.exit()

    # Make sure there is more than one pdf file
    if len(pdf_files) == 1:
        print("In the spirit of gnu tar, this script cowardly refuses to")
        print("combine one pdf file!")
        sys.exit()

    # Create unique name for output file
    output_file = time.strftime("%Y-%m-%d_%H-%M-%S.pdf")

    # Combine pdf files in order; inputs stay open until the output has been
    # written
    output = PdfFileWriter()
    sources = []
    try:
        for pdf in pdf_files:
            source = open(pdf, "rb")
            sources.append(source)
            reader = PdfFileReader(source)
            for i in range(reader.getNumPages()):
                output.addPage(reader.getPage(i))

        # Write final pdf
        with open(output_file, "wb") as stream:
            output.write(stream)
    finally:
        for source in sources:
            source.close()
"es-co" : "spa", "es" : "spa", "de-de" : "deu", "fr-fr" : "fra", "fr-ca" : "fra" } # dictionary for /Root/Lang 1 - except; 2 - a file have not /Root/Lang; 3 - /Root/Lang = ''; 4 - language ans_list = dict() # dir of folder and filter for pdf files files = [f for f in os.listdir('trainPDF') if os.path.isfile(os.path.join('trainPDF', f))] files = list(filter(lambda f: f.endswith(('.pdf','.PDF')), files)) f = open("Langs.txt", "w") for filepdf in files: try: name = 'IMAGES/'+filepdf.replace('pdf','jpg') pdfFile = PdfFileReader(file('trainPDF/'+filepdf, 'rb')) catalog = pdfFile.trailer['/Root'].getObject() if catalog.has_key("/Lang"): value = 4 lang = catalog['/Lang'].getObject() if (lang == ''): value = 3 f.write(filepdf+" "+lang+" value = "+str(value)+"\n") ans_list.update( {name : [value,'None']} ) else: lang = lang.lower() language = lan_lst.get(lang) f.write(filepdf+" "+lang+" => "+language+" value = "+str(value)+"\n") ans_list.update( {name : [value,language]} ) else: value = 2
def create_source_pdf(self, cr, uid, ids, data, report_xml, context=None):
    """Create the PDF report, reusing or storing attachments when configured.

    When ``report_xml.attachment`` is empty, the report for all ``ids`` is
    produced by a single ``create_single_pdf`` call.

    Otherwise each object is handled on its own:

    * the attachment name is the result of evaluating the expression in
      ``report_xml.attachment`` with ``object`` and ``time`` in scope;
    * when ``report_xml.attachment_use`` is set (and the context does not
      disable ``attachment_use``), an existing ``ir.attachment`` named
      ``<name>.pdf`` for this record is reused instead of rendering;
    * otherwise the report is rendered and, if a name was computed, saved as
      a new attachment (a failure to save is only logged).

    Returns ``(content, 'pdf')`` with the per-object PDFs concatenated when
    the first collected result is a pdf, ``False`` as soon as rendering one
    object fails, and the result of ``create_single_pdf`` for all ``ids`` in
    every other case.
    """
    if not context:
        context = {}
    registry = openerp.registry(cr.dbname)
    attach = report_xml.attachment
    if attach:
        objs = self.getObjects(cr, uid, ids, context)
        results = []
        for obj in objs:
            # NOTE(review): ``attach`` is evaluated as Python code; it must
            # only ever come from trusted report definitions.
            aname = eval(attach, {'object': obj, 'time': time})
            result = False
            if report_xml.attachment_use and aname and context.get(
                    'attachment_use', True):
                aids = registry['ir.attachment'].search(
                    cr, uid, [('datas_fname', '=', aname + '.pdf'),
                              ('res_model', '=', self.table),
                              ('res_id', '=', obj.id)])
                if aids:
                    brow_rec = registry['ir.attachment'].browse(
                        cr, uid, aids[0])
                    # An attachment without data contributes nothing: the
                    # object is skipped, it is not rendered again.
                    if not brow_rec.datas:
                        continue
                    d = base64.decodestring(brow_rec.datas)
                    results.append((d, 'pdf'))
                    continue
            result = self.create_single_pdf(cr, uid, [obj.id], data,
                                            report_xml, context)
            # One failing object aborts the whole report.
            if not result:
                return False
            if aname:
                try:
                    name = aname + '.' + result[1]
                    # Remove the default_type entry from the context: this
                    # is for instance used on the account.account_invoices
                    # and is thus not intended for the ir.attachment type
                    # field.
                    ctx = dict(context)
                    ctx.pop('default_type', None)
                    registry['ir.attachment'].create(
                        cr, uid, {
                            'name': aname,
                            'datas': base64.encodestring(result[0]),
                            'datas_fname': name,
                            'res_model': self.table,
                            'res_id': obj.id,
                        }, context=ctx)
                except Exception:
                    # TODO: should probably raise a proper osv_except instead,
                    # shouldn't we? see LP bug #325632
                    _logger.error(
                        'Could not create saved report attachment',
                        exc_info=True)
            results.append(result)
        if results:
            # Only the type of the first result is checked before merging.
            if results[0][1] == 'pdf':
                from pyPdf import PdfFileWriter, PdfFileReader
                output = PdfFileWriter()
                for r in results:
                    reader = PdfFileReader(cStringIO.StringIO(r[0]))
                    for page in range(reader.getNumPages()):
                        output.addPage(reader.getPage(page))
                s = cStringIO.StringIO()
                output.write(s)
                return s.getvalue(), results[0][1]
    # Fallback: no attachment expression, nothing collected, or the collected
    # results are not pdf.
    return self.create_single_pdf(cr, uid, ids, data, report_xml, context)
#!/usr/bin/env python
"""Split every page of a PDF into a left half and a right half.

Reads the PDF from standard input and writes the result to standard output;
each input page yields two output pages, the left half first.
"""
import copy
import sys

from pyPdf import PdfFileWriter, PdfFileReader

reader = PdfFileReader(sys.stdin)
writer = PdfFileWriter()

# Fetch all pages up front, then process them in order.
source_pages = []
for index in range(0, reader.getNumPages()):
    source_pages.append(reader.getPage(index))

for left in source_pages:
    # The copy has to be taken before the media box of the page is touched.
    right = copy.copy(left)
    (width, height) = left.mediaBox.upperRight
    left.mediaBox.upperRight = (width / 2, height)
    right.mediaBox.upperLeft = (width / 2, height)
    writer.addPage(left)
    writer.addPage(right)

writer.write(sys.stdout)
prog='crop', description='"%(prog)s" split pdfs', ) p.add_argument( '-i', '--input', type=str, required=True, help='Input pdf', ) return p if __name__ == '__main__': p = create_parsers() args = p.parse_args() input_filename = args.input output_filename = os.path.splitext(input_filename)[0] output_extension = os.path.splitext(input_filename)[-1] inputpdf = PdfFileReader(open(input_filename, "rb")) for i in xrange(inputpdf.numPages): output = PdfFileWriter() output.addPage(inputpdf.getPage(i)) with open(output_filename + str(i) + output_extension, "wb") as outputStream: output.write(outputStream)
#!/usr/bin/python import sys from os import system, remove from tempfile import mkstemp import random from pyPdf import PdfFileWriter, PdfFileReader # read input pdf and instantiate output pdf output = PdfFileWriter() input1 = PdfFileReader(file(sys.argv[1], "rb")) # construct and shuffle page number list pages = list(range(input1.getNumPages())) random.shuffle(pages) # display new sequence print 'Reordering pages according to sequence:' print pages # add the new sequence of pages to output pdf if len(pages) > 0: output.addPage(input1.getPage(pages[0])) # write the output pdf to file [fh, tmpfile] = mkstemp(suffix='.pdf') print tmpfile outputStream = file(tmpfile, 'wb')
def _ole_field(info, label):
    """Return the text following ``label:`` in a libmagic description, or '#'."""
    found = re.findall(label + r':.*', info)
    if not found:
        return '#'
    # NOTE(review): splitting on ':' keeps only the text up to the next colon,
    # so time-of-day values are cut after the hour -- confirm this is wanted.
    return found[0].split(':')[1].split(',')[0]


def _xmlns_uri(core_xml, prefix):
    """Return the namespace URI declared for ``prefix`` in core.xml."""
    declared = re.findall(r'xmlns:%s="https?:.*"' % prefix, core_xml)
    return declared[0].split('"')[1]


def _xpath_text(doc, path, ns):
    """Return the ASCII-folded text of the first node matching ``path``, or '#'."""
    nodes = doc.xpath(path, namespaces=ns)
    if not nodes:
        return '#'
    return unidecode.unidecode(unicode(nodes[0].text))


def _app_tag(app_xml, tag):
    """Return the content of the first ``<tag>...</tag>`` in app.xml, or '#'."""
    found = re.findall(r'<%s>.*</%s>' % (tag, tag), app_xml)
    if not found:
        return '#'
    return found[0].split('>')[1].split('<')[0]


def _pdf_metadata(fichero):
    """Return the document-information entries of a PDF file as a dict."""
    metadata_pdf = {}
    tipo_metadatos = [
        'Title', 'CreationDate', 'Author', 'Producer', 'Creator', 'ModDate',
        'Company', 'Comments', 'Keywords', 'SourceModified', 'Subject'
    ]
    try:
        pdf_stream = open(fichero, "rb")
    except (IOError, OSError):
        return metadata_pdf
    try:
        try:
            pdf_toread = PdfFileReader(pdf_stream)
        except Exception:
            # Unreadable or malformed PDF: report no metadata at all.
            return metadata_pdf
        pdf_info = pdf_toread.getDocumentInfo()
        for i in tipo_metadatos:
            metadata_pdf[i] = '#'
        # getDocumentInfo() returns None when the PDF has no /Info dictionary.
        if pdf_info is not None:
            for k, v in pdf_info.iteritems():
                # Keys look like '/Title'; drop the leading slash.
                clave = unidecode.unidecode(unicode(k.split('/')[1]))
                metadata_pdf[clave] = unidecode.unidecode(unicode(v))
        metadata_pdf['Fichero'] = fichero
        metadata_pdf['Tipo'] = 'PDF'
        return metadata_pdf
    finally:
        # The reader pulls data lazily, so close only once everything is read.
        pdf_stream.close()


def _ole_metadata(fichero, ext, info):
    """Return metadata of a legacy .doc/.ppt/.xls file from libmagic's text.

    These formats are not zip archives, so the values are taken from the
    description string produced by ``magic.from_file``.
    """
    return {
        'Fichero': fichero,
        'Tipo': ext,
        'creator': _ole_field(info, 'Author'),
        'lastModifiedBy': _ole_field(info, 'Last Saved By'),
        'created': _ole_field(info, 'Create Time/Date'),
        'modified': _ole_field(info, 'Saved Time/Date'),
        'title': _ole_field(info, 'Title'),
        'revision': _ole_field(info, 'Revision Number'),
        'lastPrinted': _ole_field(info, 'Last Printed'),
        'keywords': '#',
        'Application': _ole_field(info, 'Creating Application'),
        'Paginas': _ole_field(info, 'Pages'),
        'Palabras': _ole_field(info, 'Words'),
        'Caracteres': _ole_field(info, 'Characters'),
        'Lineas': '#',
        'Parrafos': '#',
        'Slides': '#',
        'PresentationFormat': '#'
    }


def _ooxml_metadata(fichero, ext):
    """Return metadata of a zip-based Office file (docProps/core.xml, app.xml)."""
    try:
        zf = zipfile.ZipFile(fichero)
    except Exception:
        return {}
    try:
        core_xml = zf.read('docProps/core.xml')
        app_xml = zf.read('docProps/app.xml')
    finally:
        zf.close()

    # Namespace URIs are read from the document itself to build the xpath map.
    ns = {
        'dc': _xmlns_uri(core_xml, 'dc'),
        'dcterms': _xmlns_uri(core_xml, 'dcterms'),
        'cp': _xmlns_uri(core_xml, 'cp'),
    }
    doc = lxml.etree.fromstring(core_xml)

    return {
        'Fichero': fichero,
        'Tipo': ext,
        'creator': _xpath_text(doc, '//dc:creator', ns),
        'lastModifiedBy': _xpath_text(doc, '//cp:lastModifiedBy', ns),
        'created': _xpath_text(doc, '//dcterms:created', ns),
        'modified': _xpath_text(doc, '//dcterms:modified', ns),
        'title': _xpath_text(doc, '//dc:title', ns),
        'revision': _xpath_text(doc, '//cp:revision', ns),
        'lastPrinted': _xpath_text(doc, '//cp:lastPrinted', ns),
        'keywords': _xpath_text(doc, '//cp:keywords', ns),
        'Application': _app_tag(app_xml, 'Application'),
        'Paginas': _app_tag(app_xml, 'Pages'),
        'Palabras': _app_tag(app_xml, 'Words'),
        'Caracteres': _app_tag(app_xml, 'Characters'),
        'Lineas': _app_tag(app_xml, 'Lines'),
        'Parrafos': _app_tag(app_xml, 'Paragraphs'),
        'Slides': _app_tag(app_xml, 'Slides'),
        'PresentationFormat': _app_tag(app_xml, 'PresentationFormat')
    }


def analizar_file(fichero):
    """Extract metadata from a PDF or Office document.

    The file type is decided from libmagic's description and, for legacy
    Office files, from the file-name extension. Missing values are reported
    as '#'.

    Returns a dict of metadata; an empty dict when the PDF or zip container
    cannot be opened; ``None`` when the file is of no recognised type.
    """
    ext = fichero.split('.')[-1]
    extension = magic.from_file(fichero)

    if 'PDF' in extension:
        return _pdf_metadata(fichero)
    if ext in ('doc', 'ppt', 'xls'):
        # Legacy Office format: not a zip, so parse libmagic's description.
        return _ole_metadata(fichero, ext, extension)
    if 'Word' in extension or 'Excel' in extension or 'PowerPoint' in extension:
        return _ooxml_metadata(fichero, ext)
    return None
Write a script "cover_the_emperor.py" that appends the chapter 8 practice file named "The Emperor.pdf" to the end of the chapter 8 practice file named "Emperor cover sheet.pdf" and outputs the full resulting PDF to the file "The Covered Emperor.pdf" in the chapter 8 practice files Output folder. Obviously we need required PDF files to work with. They can be found in book materials. ''' import os from pyPdf import PdfFileReader, PdfFileWriter path = "/Users/srg/practice_files" inputFileName = os.path.join(path, "The Emperor.pdf") inputFile = PdfFileReader(file(inputFileName, "rb")) coverFileName = os.path.join(path, "Emperor cover sheet.pdf") coverFile = PdfFileReader(file(coverFileName, "rb")) outputPDF = PdfFileWriter() for pageNum in xrange(0, inputFile.getNumPages()): page = inputFile.getPage(pageNum) outputPDF.addPage(page) outputPDF.addPage(coverFile.getPage(0)) outputFileName = os.path.join(path, "Output/The Covered Emperor.pdf") outputFile = file(outputFileName, "wb") outputPDF.write(outputFile) outputFile.close()
def update_file_info(self, file):
    """Fill the extra column attributes of ``file`` from its metadata.

    Every attribute is first set to an empty string. For local files
    (``file://`` URIs) the attributes are then filled according to the MIME
    type: ID3/MPEG data for mp3, EXIF and pixel size for images, kaa
    metadata for video/flac/wav, document info for PDF. Extraction is best
    effort: a value that cannot be read is replaced by a placeholder string.
    """

    def put(attr, compute, fallback):
        # Set ``attr`` from compute(); use ``fallback`` when the value is
        # missing or cannot be stored.
        try:
            file.add_string_attribute(attr, compute())
        except Exception:
            file.add_string_attribute(attr, fallback)

    def put_all(attrs, value):
        # Set every attribute in ``attrs`` to the same ``value``.
        for attr in attrs:
            file.add_string_attribute(attr, value)

    def hms(seconds):
        # Fixed-width hh:mm:ss so the column sorts correctly as text.
        return "%02i:%02i:%02i" % (int(seconds / 3600),
                                   int(seconds / 60 % 60),
                                   int(seconds % 60))

    id3_attrs = ('title', 'album', 'artist', 'tracknumber', 'genre', 'date')
    exif_attrs = ('exif_datetime_original', 'exif_software', 'exif_flash',
                  'exif_pixeldimensions')
    video_attrs = ('length', 'pixeldimensions', 'bitrate', 'samplerate',
                   'title', 'artist', 'genre', 'tracknumber', 'date', 'album')

    # set defaults to blank
    put_all(id3_attrs + ('bitrate', 'samplerate', 'length') + exif_attrs +
            ('pixeldimensions',), '')

    if file.get_uri_scheme() != 'file':
        return

    # strip file:// to get absolute path
    filename = urllib.unquote(file.get_uri()[7:])

    # mp3 handling
    if file.is_mime_type('audio/mpeg'):
        # attempt to read ID3 tag
        try:
            audio = EasyID3(filename)
            # individual tags may be missing, hence one fallback per tag
            for tag in id3_attrs:
                put(tag, lambda: audio[tag][0], "[n/a]")
        except Exception:
            # some files have no ID3 tag at all
            put_all(id3_attrs, "[no ID3]")

        # try to read MP3 information (bitrate, length, samplerate)
        mpfile = None
        try:
            mpfile = open(filename)
            mpinfo = MPEGInfo(mpfile)
            file.add_string_attribute('bitrate',
                                      str(mpinfo.bitrate / 1000) + " Kbps")
            file.add_string_attribute('samplerate',
                                      str(mpinfo.sample_rate) + " Hz")
            file.add_string_attribute('length', hms(mpinfo.length))
        except Exception:
            put_all(('bitrate', 'length', 'samplerate'), "[n/a]")
        finally:
            if mpfile is not None:
                mpfile.close()

    # image handling
    if (file.is_mime_type('image/jpeg') or file.is_mime_type('image/png')
            or file.is_mime_type('image/gif')
            or file.is_mime_type('image/bmp')):
        # EXIF handling routines
        try:
            metadata = pyexiv2.ImageMetadata(filename)
            metadata.read()
            put('exif_datetime_original',
                lambda: str(metadata['Exif.Photo.DateTimeOriginal'].raw_value),
                "")
            put('exif_software',
                lambda: str(metadata['Exif.Image.Software'].raw_value), "")
            put('exif_flash',
                lambda: str(metadata['Exif.Photo.Flash'].raw_value), "")
            # NOTE(review): written as Y 'x' X, unlike 'pixeldimensions'
            # below, which is width 'x' height -- confirm the intended order.
            put('exif_pixeldimensions',
                lambda: str(metadata['Exif.Photo.PixelYDimension'].raw_value)
                + 'x'
                + str(metadata['Exif.Photo.PixelXDimension'].raw_value), "")
        except Exception:
            # no exif data?
            put_all(exif_attrs, "")

        # try read image info directly
        try:
            im = Image.open(filename)
            file.add_string_attribute('pixeldimensions',
                                      str(im.size[0]) + 'x' + str(im.size[1]))
        except Exception:
            file.add_string_attribute('pixeldimensions', "[image read error]")

    # video/flac handling
    if (file.is_mime_type('video/x-msvideo')
            or file.is_mime_type('video/mpeg')
            or file.is_mime_type('video/x-ms-wmv')
            or file.is_mime_type('video/mp4')
            or file.is_mime_type('audio/x-flac')
            or file.is_mime_type('video/x-flv')
            or file.is_mime_type('video/x-matroska')
            or file.is_mime_type('audio/x-wav')):
        try:
            info = kaa.metadata.parse(filename)
            put('length', lambda: hms(info.length), '[n/a]')
            put('pixeldimensions',
                lambda: str(info.video[0].width) + 'x'
                + str(info.video[0].height), '[n/a]')
            put('bitrate',
                lambda: str(round(info.audio[0].bitrate / 1000)), '[n/a]')
            put('samplerate',
                lambda: str(int(info.audio[0].samplerate)) + ' Hz', '[n/a]')
            put('title', lambda: info.title, '[n/a]')
            put('artist', lambda: info.artist, '[n/a]')
            put('genre', lambda: info.genre, '[n/a]')
            put('tracknumber', lambda: info.trackno, '[n/a]')
            put('date', lambda: info.userdate, '[n/a]')
            put('album', lambda: info.album, '[n/a]')
        except Exception:
            # 'tracknumber' (not 'track') is the attribute declared in the
            # defaults above.
            put_all(video_attrs, 'error')

    # pdf handling
    if file.is_mime_type('application/pdf'):
        try:
            f = open(filename, "rb")
            try:
                pdf = PdfFileReader(f)
                put('title', lambda: pdf.getDocumentInfo().title, "[n/a]")
                put('artist', lambda: pdf.getDocumentInfo().author, "[n/a]")
            finally:
                # close the file even when the PDF cannot be parsed
                f.close()
        except Exception:
            put_all(('title', 'artist'), "[no info]")

    self.get_columns()
## print inch ## can.drawString(0.3*inch, -inch, "Hello World") #change cage code cage = '55910' can.drawString(450*mult, start, "ECPVG2") can.drawString(450*mult, (start - 15*mult), "CAGE: " + cage) can.save() #move to the beginning of the StringIO buffer packet.seek(0) name = 'Dave' can.beginForm(name, lowerx=0, lowery=0, upperx=None, uppery=None) can.endForm() new_pdf = PdfFileReader(packet) # read your existing PDF fname = 'McMaster-Carr_Source_files\\' + i existing_pdf = PdfFileReader(file(fname, "rb")) output = PdfFileWriter() # add the "watermark" (which is the new pdf) on the existing page nump = existing_pdf.getNumPages() page = existing_pdf.getPage(0) for l in range(nump): output.addPage(existing_pdf.getPage(l)) page.mergePage(new_pdf.getPage(0)) # finally, write "output" to a real file outputStream = file(a[0]+"_"+b, "wb") output.write(outputStream) outputStream.close()
# 11.1 review exercises import os from pyPdf import PdfFileReader, PdfFileWriter path = "C:/Real Python/refactor/chp12/practice_files" input_file_name = os.path.join(path, "The Whistling Gypsy.pdf") input_file = PdfFileReader(open(input_file_name, "rb")) # Display meta-data about file print("Title:", input_file.getDocumentInfo().title) print("Author:", input_file.getDocumentInfo().author) print("Number of pages:", input_file.getNumPages()) # Specify and open output text file output_file_name = os.path.join(path, "Output/The Whistling Gypsy.txt") with open(output_file_name, "w") as output_file: # Extract every page of text for page_num in range(0, input_file.getNumPages()): text = input_file.getPage(page_num).extractText() text = text.encode("utf-8") # convert text to unicode output_file.write(text) # Save file without cover page output_PDF = PdfFileWriter() for page_num in range(1, input_file.getNumPages()): output_PDF.addPage(input_file.getPage(page_num)) output_file_name = os.path.join(path, "Output/The Whistling Gypsy un-covered.pdf") with open(output_file_name, "wb") as output_file:
def create_source_pdf(self, cr, uid, ids, data, report_xml, context=None):
    """Render the report for ``ids`` with Bill of Lading handling.

    Behaves like the stock attachment-aware renderer, with two differences
    when the active model is ``stock.picking.out`` and the report is a
    'Bill of Lading' or 'Master Bill of Lading':

    * stored attachments are never reused; the previous BOL attachment of
      the picking is deleted and replaced by a freshly rendered one;
    * a Master BOL is rendered once for all records and the resulting single
      attachment is linked to every picking.

    Returns ``(content, format)``, or ``False`` when a rendering fails.
    """
    if not context:
        context = {}
    pool = pooler.get_pool(cr.dbname)
    pool_attach = pool.get('ir.attachment')
    picking_obj = pool.get('stock.picking.out')
    # ``context`` may be the empty dict created just above, so the key must
    # be read with .get() to avoid a KeyError.
    active_model = context.get('active_model')
    myflag = False
    # if data['model'] == 'stock.picking.out':
    if active_model == 'stock.picking.out':
        if report_xml.name in ['Bill of Lading', 'Master Bill of Lading']:
            myflag = True
    attach = report_xml.attachment
    singleton = False
    MBOL = []  # rendered Master BOL, produced once and shared by all records
    M_attach = False  # id of the single Master BOL attachment, once created
    if attach:
        objs = self.getObjects(cr, uid, ids, context)
        results = []
        for obj in objs:
            # NOTE(review): ``attach`` is evaluated with eval(); presumably an
            # admin-defined expression -- confirm it is never untrusted.
            aname = eval(attach, {'object': obj, 'time': time})
            result = False
            if not myflag:
                if report_xml.attachment_use and aname and context.get('attachment_use', True):
                    aids = pool_attach.search(cr, uid, [
                        ('datas_fname', '=', aname + '.pdf'),
                        ('res_model', '=', self.table),
                        ('res_id', '=', obj.id)])
                    if aids:
                        brow_rec = pool_attach.browse(cr, uid, aids[0])
                        if not brow_rec.datas:
                            continue
                        d = base64.decodestring(brow_rec.datas)
                        results.append((d, 'pdf'))
                        continue
            if myflag and report_xml.name == 'Master Bill of Lading':
                data.update({'objects': objs})
            if not MBOL:
                result = self.create_single_pdf(cr, uid, [obj.id], data, report_xml, context)
            else:
                result = MBOL
            # if data['model'] == 'stock.picking.out' and report_xml.name == 'Master Bill of Lading':
            if active_model == 'stock.picking.out' and report_xml.name == 'Master Bill of Lading':
                MBOL = result
                singleton = True
            if not result:
                return False
            if aname:
                try:
                    # The file name keeps the evaluated attachment name; only
                    # the attachment's display name is replaced below.
                    name = aname + '.' + result[1]
                    if myflag:
                        if report_xml.name == 'Master Bill of Lading':
                            att_id = picking_obj.browse(cr, uid, obj.id).attached_mbol_report_id.id
                            if att_id:
                                pool_attach.unlink(cr, uid, [att_id])
                            aname = 'Master BOL-' + time.strftime('%Y-%m-%d %H:%M:%S')
                        else:
                            #unlink the previous attached BOL report
                            att_id = picking_obj.browse(cr, uid, obj.id).attached_report_id.id
                            if att_id:
                                pool_attach.unlink(cr, uid, [att_id])
                            aname = 'BOL-' + time.strftime('%Y-%m-%d %H:%M:%S')
                    if not M_attach:
                        new_attach = pool_attach.create(cr, uid, {
                            'name': aname,
                            'datas': base64.encodestring(result[0]),
                            'datas_fname': name,
                            'res_model': self.table,
                            'res_id': obj.id,
                            }, context=context
                        )
                    else:
                        new_attach = M_attach
                    if myflag:
                        # Create new attachment of BOL report
                        new_val = {'attached_report_id': new_attach}
                        if report_xml.name == 'Master Bill of Lading':
                            M_attach = new_attach
                            new_val = {'attached_mbol_report_id': new_attach}
                        picking_obj.write(cr, uid, obj.id, new_val)
                except Exception:
                    #TODO: should probably raise a proper osv_except instead, shouldn't we? see LP bug #325632
                    logging.getLogger('report').error('Could not create saved report attachment', exc_info=True)
            # A Master BOL is not collected per record.
            if not MBOL:
                results.append(result)
        if results:
            if results[0][1] == 'pdf':
                from pyPdf import PdfFileWriter, PdfFileReader
                output = PdfFileWriter()
                for r in results:
                    reader = PdfFileReader(cStringIO.StringIO(r[0]))
                    for page in range(reader.getNumPages()):
                        output.addPage(reader.getPage(page))
                s = cStringIO.StringIO()
                output.write(s)
                return s.getvalue(), results[0][1]
    return self.create_single_pdf(cr, uid, ids, data, report_xml, context)
# Copy a PDF page by page and blank out its identifying metadata fields.
# Only the beginning of the script is visible here.
from pyPdf import PdfFileReader, PdfFileWriter
from pyPdf.generic import NameObject, createStringObject

inpfn = raw_input('Enter PDF path : ')
fin = open(inpfn, 'rb')
pdf_in = PdfFileReader(fin)
writer = PdfFileWriter()
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

# Start from the source document's metadata ...
infoDict = writer._info.getObject()
info = pdf_in.documentInfo
# documentInfo is None when the source PDF has no /Info dictionary.
for key in (info or {}):
    infoDict.update({NameObject(key): createStringObject(info[key])})

# ... then overwrite the identifying entries with empty strings.
list_of_data_to_delete = [
    '/CreationDate', '/Author', '/Creator', '/ModDate', '/Producer', '/Title'
]
for item in list_of_data_to_delete:
    try:
        infoDict.update({NameObject(item): createStringObject(u'')})
    except Exception:
        # report the entry that failed (the loop variable is ``item``)
        print("can't delete : ", item)

fout = open('outputFile.pdf', 'wb')
import os from pyPdf import PdfFileReader path = "/Users/KevinKoshy/PycharmProjects/RealPythonEg" input_file_name = os.path.join(path, "Pride and Prejudice.pdf") input_file = PdfFileReader(file(input_file_name, "rb")) print "Number of pages = ", input_file.getNumPages() print "Title = ", input_file.getDocumentInfo().title
import os from pyPdf import PdfFileReader my_path = "D:/Training/Python-Learning/realpython-webster/Course1/Practice files_12" input_file_name = os.path.join(my_path, "half and half.pdf") input_file = PdfFileReader(file(input_file_name, "rb")) page = input_file.getPage(0) print page.mediaBox
# Extract the exam questions from "SY0-301.pdf" into Question objects.
# Only the beginning of the script is visible here.
from pyPdf import PdfFileReader
import re
from Question import Question

if __name__ == '__main__':
    # PDF is a binary format: open with "rb" so the bytes are never
    # translated by text-mode handling.
    pdffile = PdfFileReader(open("SY0-301.pdf", "rb"))

    # Concatenate the text of all pages, dropping boilerplate strings.
    all_text = ''
    for page in pdffile.pages:
        text = page.extractText().strip()
        text = text.replace('Explanation:', '')
        text = text.replace(
            'CompTIA SY0-301 Exam"Pass Any Exam. Any Time." - ' +
            'www.actualtests.com', '')
        text = text.replace('CompTIA SY0-301', '')
        text = text.replace('CompTIA Security+Version: 15.0', '')
        # NOTE(review): as written this replaces a space with a space;
        # presumably it was meant to collapse double spaces -- confirm.
        text = text.replace(' ', ' ')
        all_text += ' ' + text

    # qp finds each whole question; pp splits one question into
    # number, question text, answer options and correct answer.
    qp = re.compile(r'QUESTION\sNO:\s\d+.*?Answer:\s\w,?\w?')
    pp = re.compile(r'QUESTION\sNO:\s(\d+)(.*?)(A[.].*?)Answer:\s(\w,?\w?)')
    q_list = re.findall(qp, all_text)

    questions = []
    for q in q_list:
        parts = re.match(pp, q)
        question = Question()
        question.number = int(parts.group(1))
        question.question = parts.group(2).strip()
        question.correct_answer = parts.group(4).strip()
        letters = ['A', 'B', 'C', 'D', 'E', 'F']
text = text.replace(r, u" ")
# The three statements here are the tail of a text-cleaning function whose
# ``def`` line (and the loop that binds ``r``) lie outside this excerpt.
# U+FB01 is the single-character "fi" ligature; it is expanded to two letters.
text = text.replace(u"\ufb01", u"fi")
return text


# Build a Paper record for every PDF found in papers_dir.
files = os.listdir(papers_dir)
files = [x for x in files if x[-3:] == 'pdf']
titles = []
authors = []
keywords = []
all_papers = []
for i, f in enumerate(files):
    # papers_dir is concatenated directly, so it presumably ends with a path
    # separator -- verify against its definition.
    reader = PdfFileReader(open(papers_dir + f, 'rb'))
    info = dict(reader.documentInfo)
    # Metadata is reduced to ASCII; characters that cannot be encoded are
    # dropped. A missing /Title, /Author or /Keywords entry raises KeyError.
    title = info['/Title'].encode('ascii', 'ignore')
    author = info['/Author'].encode('ascii', 'ignore')
    keyword = info['/Keywords'].encode('ascii', 'ignore')
    # titles += [info['/Title']]
    # authors += [info['/Author']]
    # keywords += [info['/Keywords']]
    all_papers += [
        Paper(id_=random.randint(0, 999999999),
              title=title,
              authors=author,
              filename=f,
              keywords=keyword)
    ]
# call_log(Request_id,status,call_log_bat) call_log_new(Request_id, Process_Type, Process_Head, Process_Name, Process_Time, Current_Status, Time_Stamp) pdf_list = [x.replace('\n', '') for x in pdfinput] bat_xlsx = listpath_new.replace( 'Internal Omni Request', 'Internal Process Sql Request') if os.path.isfile(bat_xlsx): os.remove(bat_xlsx) for pdf in pdf_list: input1 = PdfFileReader(open(pdf, "rb")) input1.getPage(0).mediaBox pxcel_files = list(input1.getPage(0).mediaBox) if int(pxcel_files[2]) > 2015 or int( pxcel_files[3]) > 2015: x = float(1000 / float(pxcel_files[3])) print x # print input1.getPage(0).scale(0.5, 0.5) print input1.getPage(0).scaleBy(.3) output = PdfFileWriter() print input1.getPage(0).mediaBox file_folder = pdf.split('\\') p_name = file_folder[-1] omni_filesave = omni_savedir + '\\' + p_name output.addPage(input1.getPage(0))
import sys if __name__ == '__main__' and len( sys.argv) > 0 and sys.argv[1][-3:].upper() == 'PDF': from pyPdf import PdfFileWriter, PdfFileReader print len(sys.argv) #inp[] total = len(sys.argv) original = sys.argv[1] target = 'Combine.pdf' inp = PdfFileReader(file(sys.argv[1], "rb")) page = inp.getPage(0) output = PdfFileWriter() for i in range(1, total): inp = PdfFileReader(file(sys.argv[i], "rb")) page.mergePage(inp.getPage(0)) #numPages = input1.getNumPages() # print the title of document1.pdf # print "title = %s" % (input1.getDocumentInfo().title) #for i in total: # page.mergePage(inp[i].getPage(1)) output.addPage(page) outputStream = file(target, "wb") output.write(outputStream) outputStream.close() print "DONE !"
def post(self):
    """Generate and return a personalised certificate PDF.

    Reads ``name`` from the request, looks up the matching ``Priestessess``
    entity, draws the name and a date sentence onto a blank page with
    ReportLab, merges that page over "AmericanPriestessCert.pdf" and sends
    the result as an attachment. On any failure, or when no name was given,
    ``pdf.html`` is rendered with an error message.
    """
    name = self.request.get('name')
    # NOTE(review): the mail is sent before ``name`` is checked, so it goes
    # out even for an empty name -- confirm this is intended.
    sendmail('*****@*****.**', '*****@*****.**', name)
    if name:
        # if not id:
        #     self.error(500)
        #     return
        try:
            # The first query object is replaced immediately by the next line.
            q = db.Query(Priestessess)
            q = Priestessess.all()
            q.filter('name =', name)
            e = q.get()
            cert_name = e.name
            # ``date``, ``dates``, ``months`` and ``years`` are not assigned
            # in this method; presumably module-level values, with ``date``
            # formatted as month-day-year -- verify.
            d = [dates[x] for x in dates if x == date.split('-')[1]]
            m = [months[x] for x in months if x == date.split('-')[0]]
            y = [years[x] for x in years if x == date.split('-')[2]]
            # NOTE(review): this text is written to the response before the
            # PDF bytes; looks like leftover debug output -- confirm.
            self.response.write(str(m) + str(d))
            date_str = " In witness whereof we have placed our name on this,"
            # str(list).split("'")[1] extracts the first quoted element of
            # each one-element list.
            date_str2 = "the {0} day of {1}, in the year Two Thousand {2}." \
                .format(str(d).split("'")[1], str(m).split("'")[1], str(y).split("'")[1])
            packet = StringIO.StringIO()
            # create a new PDF with Reportlab
            can = canvas.Canvas(packet, pagesize=letter)
            text = can.beginText()
            text2 = can.beginText()
            text.setTextOrigin(10.3 * cm, 10.8 * cm)
            text2.setTextOrigin(8.8 * cm, 7.7 * cm)
            text2.setFont("Tangerine_Bold", 20)
            text.setFont('VeraBd', 20)
            text.textLine(cert_name.upper())
            text2.textLine(date_str)
            text2.textLine(date_str2)
            can.drawText(text)
            can.drawText(text2)
            # can.drawString(300, 310, name)
            can.save()
            #move to the beginning of the StringIO buffer
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            # read your existing PDF
            existing_pdf = PdfFileReader(
                file("AmericanPriestessCert.pdf", "rb"))
            output = PdfFileWriter()
            # add the "watermark" (which is the new pdf) on the existing page
            page = existing_pdf.getPage(0)
            page.mergePage(new_pdf.getPage(0))
            output.addPage(page)
            outputstream = StringIO.StringIO()
            output.write(outputstream)
            self.response.headers['Content-Type'] = 'application/pdf'
            self.response.headers[
                'Content-Disposition'] = 'attachment; filename=AmericanPriestessCert.pdf'
            self.response.headers['Content-Transfer-Encoding'] = 'binary'
            self.response.out.write(outputstream.getvalue())
        except:
            # Every failure above (no matching entity, bad date, PDF errors)
            # ends up here and is reported with the same message.
            error = """There was an error.\n Please either re-enter your name \
exactly as you typed it in field one of the application form, or check \
your email to ensure payment has been successful."""
            self.render('pdf.html', error=error)
    else:
        error = "Please enter you full name, as it appears on your certificate."
        self.render('pdf.html', error=error)
# 8.2 cover_the_emperor.py
# Put the cover sheet in front of "The Emperor.pdf" and save the combined
# document as a new PDF
import os
from pyPdf import PdfFileReader, PdfFileWriter

base_dir = "C:/Real Python/Course materials/Chapter 8/Practice files"

cover_reader = PdfFileReader(
    open(os.path.join(base_dir, "Emperor cover sheet.pdf"), "rb"))
story_reader = PdfFileReader(
    open(os.path.join(base_dir, "The Emperor.pdf"), "rb"))

combined = PdfFileWriter()

# Copy every page, cover sheet first, then the story
for reader in (cover_reader, story_reader):
    for index in range(reader.getNumPages()):
        combined.addPage(reader.getPage(index))

# Write the combined document to the Output folder
target_name = os.path.join(base_dir, "Output/The Covered Emperor.pdf")
with open(target_name, "wb") as target:
    combined.write(target)
def do_update_file_info(self, file):
    """Collect metadata for ``file`` and store it as column attributes.

    A fresh ``FileExtensionInfo`` is filled according to the file's MIME
    type (mp3, image, video/flac/wav or PDF) and handed to
    ``self.set_file_attributes``. Extraction is best effort: any value that
    cannot be read is simply left at whatever ``FileExtensionInfo`` starts
    with.
    """
    info = FileExtensionInfo()

    # strip file:// to get absolute path
    filename = urllib.unquote(file.get_uri()[7:])

    # mp3 handling
    if file.is_mime_type('audio/mpeg'):
        # attempt to read ID3 tag
        try:
            audio = EasyID3(filename)
            # sometimes the audio variable will not have one of these items defined, that's why
            # there is this long try / except attempt
            try:
                info.title = audio["title"][0]
            except:
                pass
            try:
                info.album = audio["album"][0]
            except:
                pass
            try:
                info.artist = audio["artist"][0]
            except:
                pass
            try:
                # right-aligned, zero-padded to two characters
                info.tracknumber = "{:0>2}".format(audio["tracknumber"][0])
            except:
                pass
            try:
                info.genre = audio["genre"][0]
            except:
                pass
            try:
                info.date = audio["date"][0]
            except:
                pass
        except:
            pass

        # try to read MP3 information (bitrate, length, samplerate)
        try:
            mpfile = open(filename)
            mpinfo = MPEGInfo(mpfile)
            info.bitrate = str(mpinfo.bitrate / 1000) + " Kbps"
            info.samplerate = str(mpinfo.sample_rate) + " Hz"
            # [SabreWolfy] added consistent formatting of times in format hh:mm:ss
            # [SabreWolfy[ to allow for correct column sorting by length
            mp3length = "%02i:%02i:%02i" % ((int(mpinfo.length / 3600)),
                                            (int(mpinfo.length / 60 % 60)),
                                            (int(mpinfo.length % 60)))
            mpfile.close()
            info.length = mp3length
        except:
            # mpfile may not exist if open() itself failed, hence the
            # nested guard around close()
            try:
                mpfile.close()
            except:
                pass

    # image handling
    elif file.is_mime_type('image/jpeg') or file.is_mime_type(
            'image/png') or file.is_mime_type(
                'image/gif') or file.is_mime_type('image/bmp'):
        # EXIF handling routines
        try:
            metadata = pyexiv2.ImageMetadata(filename)
            metadata.read()

            try:
                exif_datetimeoriginal = metadata[
                    'Exif.Photo.DateTimeOriginal']
                info.exif_datetime_original = str(
                    exif_datetimeoriginal.raw_value)
            except:
                pass

            try:
                exif_imagesoftware = metadata['Exif.Image.Software']
                info.exif_software = str(exif_imagesoftware.raw_value)
            except:
                pass

            try:
                exif_photoflash = metadata['Exif.Photo.Flash']
                info.exif_flash = str(exif_photoflash.raw_value)
            except:
                pass

            try:
                exif_rating = metadata['Xmp.xmp.Rating']
                info.exif_rating = str(exif_rating.raw_value)
            except:
                pass
        except:
            pass

        # try read image info directly
        try:
            im = PIL.Image.open(filename)
            info.pixeldimensions = str(im.size[0]) + 'x' + str(im.size[1])
        # NOTE(review): ``error`` is not defined in this method; presumably an
        # exception class imported at module level -- confirm, otherwise a
        # failing Image.open raises NameError here.
        except error as e:
            print e
            pass

    # video/flac handling
    elif file.is_mime_type('video/x-msvideo') | file.is_mime_type(
            'video/mpeg'
    ) | file.is_mime_type('video/x-ms-wmv') | file.is_mime_type(
            'video/mp4'
    ) | file.is_mime_type('audio/x-flac') | file.is_mime_type(
            'video/x-flv') | file.is_mime_type(
                'video/x-matroska') | file.is_mime_type('audio/x-wav'):
        try:
            metadata = kaa.metadata.parse(filename)
            try:
                info.length = "%02i:%02i:%02i" % (
                    (int(metadata.length / 3600)),
                    (int(metadata.length / 60 % 60)),
                    (int(metadata.length % 60)))
            except:
                pass
            try:
                info.pixeldimensions = str(
                    metadata.video[0].width) + 'x' + str(
                        metadata.video[0].height)
            except:
                pass
            try:
                info.bitrate = str(round(metadata.audio[0].bitrate / 1000))
            except:
                pass
            try:
                info.samplerate = str(int(
                    metadata.audio[0].samplerate)) + ' Hz'
            except:
                pass
            try:
                info.title = metadata.title
            except:
                pass
            try:
                info.artist = metadata.artist
            except:
                pass
            try:
                info.genre = metadata.genre
            except:
                pass
            try:
                info.tracknumber = metadata.trackno
            except:
                pass
            try:
                info.date = metadata.userdate
            except:
                pass
            try:
                info.album = metadata.album
            except:
                pass
        except:
            pass

    # pdf handling
    elif file.is_mime_type('application/pdf'):
        try:
            f = open(filename, "rb")
            # NOTE(review): if PdfFileReader raises, f.close() below is
            # skipped and the handle is left to the garbage collector.
            pdf = PdfFileReader(f)
            try:
                info.title = pdf.getDocumentInfo().title
            except:
                pass
            try:
                # the PDF author is shown in the 'artist' column
                info.artist = pdf.getDocumentInfo().author
            except:
                pass
            f.close()
        except:
            pass

    self.set_file_attributes(file, info)

    del info
print "Output file must be a PDF."
# Fragment: the print above belongs to an argument check that starts before
# this excerpt; opts, input_file and output_file are defined there too.

# Margins default to zero and are overridden by -m/--margin, whose value is a
# (possibly double-quoted) string of four whitespace-separated numbers in the
# order left, top, right, bottom.
margin = {"l": 0, "t": 0, "r": 0, "b": 0}
for a in opts[:]:
    if a[0] == '-m' or a[0] == '--margin':
        if a[1] != None:
            m_temp = a[1].strip("\"").split()
            margin["l"] = float(m_temp[0])
            margin["t"] = float(m_temp[1])
            margin["r"] = float(m_temp[2])
            margin["b"] = float(m_temp[3])
        else:
            print "Error"

input1 = PdfFileReader(file(input_file, "rb"))
output = PdfFileWriter()
outputstream = file(output_file, "wb")
pages = input1.getNumPages()

# Corner coordinates of the first page's media box
top_right = {
    'x': input1.getPage(0).mediaBox.getUpperRight_x(),
    'y': input1.getPage(0).mediaBox.getUpperRight_y()
}
top_left = {
    'x': input1.getPage(0).mediaBox.getUpperLeft_x(),
    'y': input1.getPage(0).mediaBox.getUpperLeft_y()
}
bottom_right = {
# 8.2 review exercises import os import copy from pyPdf import PdfFileReader, PdfFileWriter path = "C:/Real Python/Course materials/Chapter 8/Practice files" inputFileName = os.path.join(path, "Walrus.pdf") inputFile = PdfFileReader(file(inputFileName, "rb")) outputPDF = PdfFileWriter() inputFile.decrypt("IamtheWalrus") # decrypt the input file for pageNum in range(0, inputFile.getNumPages()): # rotate pages (call everything pageLeft for now; will make a copy) pageLeft = inputFile.getPage(pageNum) pageLeft.rotateCounterClockwise(90) pageRight = copy.copy(pageLeft) # split each page in half upperRight = pageLeft.mediaBox.upperRight # get original page corner # crop and add left-side page pageLeft.mediaBox.upperRight = (upperRight[0]/2, upperRight[1]) outputPDF.addPage(pageLeft) # crop and add right-side page pageRight.mediaBox.upperLeft = (upperRight[0]/2, upperRight[1]) outputPDF.addPage(pageRight) # save new pages to an output file outputFileName = os.path.join(path, "Output/Updated Walrus.pdf") with file(outputFileName, "wb") as outputFile:
def clean(self, *args, **kwargs):
    """Reject the instance unless ``self.pdf`` can be opened by pyPdf.

    ``PdfFileReader`` is constructed purely as a parse check; the reader
    object itself is discarded.

    Raises:
        ValidationError: if constructing the reader raises any ``Exception``.
    """
    try:
        PdfFileReader(self.pdf)
    except Exception:
        # The user-facing message reads "Only a PDF file is allowed".
        message = _('Разрешен только PDF файл')
        raise ValidationError(message)
'''THIS IS PYTHON 2 CODE

Print every metadata entry stored in a PDF's document-information dictionary.
'''
import pyPdf
from pyPdf import PdfFileReader

# A raw string is required here: in a normal literal '\t' and '\f' are the
# TAB and FORM FEED escapes, which silently corrupts a Windows-style path.
# The reader is given an open binary file object; the ``with`` block closes
# it once the metadata has been printed.
with open(r'path\to\file.pdf', 'rb') as pdf_stream:
    reader = PdfFileReader(pdf_stream)
    # getDocumentInfo() returns a dictionary-like object, or None when the
    # PDF has no /Info entry; fall back to an empty dict so the loop is a no-op.
    info = reader.getDocumentInfo() or {}
    for meta_item in info:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("{} Info: {}".format(meta_item, info[meta_item]))
# Stamp the web link and the current date/time onto every page of the
# generated PDF.
outputPDF = PdfFileWriter()  # not used in this section; kept in case later code refers to it
packet = StringIO.StringIO()

# Create a one-page in-memory PDF with ReportLab that holds only the footer.
can = canvas.Canvas(packet, pagesize=letter)
can.setFont("Helvetica", 9)
# Footer text: the source URL on the left, the timestamp further right.
oknow = time.strftime("%a, %d %b %Y %H:%M")
can.drawString(5, 2, url)
# NOTE(review): x=605 only fits if the stamped pages are wider than portrait
# letter (612 pt) -- confirm against the PDFs this is run on.
can.drawString(605, 2, oknow)
can.save()

# Move to the beginning of the StringIO buffer before reading it back.
packet.seek(0)
new_pdf = PdfFileReader(packet)

# Read the existing PDF.  The input stream must stay open until the output
# has been written, because the writer pulls page data from it at write time;
# the nested ``with`` blocks close both files even if merging fails.
with open(tem_pdf, "rb") as existing_stream:
    existing_pdf = PdfFileReader(existing_stream)
    pages = existing_pdf.getNumPages()
    output = PdfFileWriter()
    stamp_page = new_pdf.getPage(0)  # same overlay for every page
    # Add the "watermark" (the new PDF) on top of each existing page.
    for x in range(0, pages):
        page = existing_pdf.getPage(x)
        page.mergePage(stamp_page)
        output.addPage(page)
    # Finally, write "output" to a real file.
    with open(final_file, "wb") as outputStream:
        output.write(outputStream)
print(final_file, 'is ready.')
import StringIO
from reportlab.pdfgen import canvas

# To register a specific font
# from reportlab.pdfbase import pdfmetrics
# from reportlab.pdfbase.ttfonts import TTFont
# pdfmetrics.registerFont(TTFont('Allura', 'Allura.ttf'))

# For every line of list.csv, overlay that line's text on the first page of
# original.pdf and save the result as "<text>.pdf".
with open('list.csv') as names_file:
    for line in names_file:
        # Strip only the line terminator: slicing off the last character
        # would eat a real character when the final line has no newline.
        name = line.rstrip('\r\n')
        if not name:
            continue  # a blank line would otherwise produce a file named ".pdf"

        packet = StringIO.StringIO()
        # Create a new one-page PDF with ReportLab holding just the text.
        can = canvas.Canvas(packet, (864, 608.9))
        # Float division is required: under Python 2, 100 / 256 is 0 and the
        # text would come out black instead of dark blue.
        can.setFillColorRGB(0, 0, 100 / 256.0)
        #can.setFont("Allura", 40)
        can.drawCentredString(432, 240, name)
        can.save()

        # Move to the beginning of the StringIO buffer before reading it back.
        packet.seek(0)
        new_pdf = PdfFileReader(packet)

        # Re-read the template for every name so each output starts from an
        # unmodified page; keep it open until the output has been written.
        with open("original.pdf", "rb") as original_stream:
            existing_pdf = PdfFileReader(original_stream)
            output = PdfFileWriter()
            page = existing_pdf.getPage(0)
            page.mergePage(new_pdf.getPage(0))
            output.addPage(page)
            # Finally, write "output" to a real file.
            with open(name + ".pdf", "wb") as outputStream:
                output.write(outputStream)
from pyPdf import PdfFileWriter, PdfFileReader

output = PdfFileWriter()  # never used by this script; retained as a module-level name

# Print the document-information dictionary, then the extracted text of the
# first page, then the extracted text of up to the first five pages.
with open("CONSTITUCION-Interiores.pdf", "rb") as pdf_stream:
    input1 = PdfFileReader(pdf_stream)

    # Wrap the value in a 1-tuple so %-formatting always treats it as a
    # single argument, whatever type getDocumentInfo() returns.
    print("title = %s" % (input1.getDocumentInfo(),))

    page_count = input1.getNumPages()
    if page_count:
        print("title = %s" % (input1.getPage(0).extractText(),))
    # Clamp to the real page count: a fixed range(5) fails on documents that
    # have fewer than five pages.
    for i in range(min(5, page_count)):
        print("title = %s" % (input1.getPage(i).extractText(),))
from sys import argv from pyPdf import PdfFileReader from os import path filename = argv[1] document = PdfFileReader(file(filename, "rb")) pages = document.getNumPages() with open(filename+".info", 'w') as out: path = path.dirname(filename) if path: path = path + '/' out.write("""import json def UpdateInfo(): global FileName, FileList, PageCount global DocumentTitle global Pcurrent, Pnext, Tcurrent, Tnext, InitialPage global RTrunning, RTrestart, StartTime, PageEnterTime, CurrentTime with open('"""+path+"""json.txt', 'w') as io: json.dump(({"page_count": PageCount, "current_page": Pcurrent, "previous_page": Pnext, "start_time": StartTime, "pageenter_time": PageEnterTime, "current_time": CurrentTime, "notes": PageProps[Pcurrent]['notes']}), io) PageProps = { """) for i in range(1,pages + 1): if i < pages: out.write(" "+str(i)+": {\n 'transition': None,\n 'overview': True,\n 'notes': '',\n 'OnEnter': UpdateInfo\n },\n") else:
def export_to_file(self, file_out, only_selected=False):
    """Write the pages listed in ``self.model`` to a new PDF file.

    Each document in ``self.pdfqueue`` is opened from its ``copyname``.
    Every row of ``self.model`` then contributes one page, rotated and
    cropped as the row specifies.

    Args:
        file_out: path of the PDF to create (opened in binary write mode).
        only_selected: when true, rows whose ``path`` is not among
            ``self.iconview.get_selected_items()`` are skipped.

    Raises:
        Exception: if an input file is encrypted and cannot be decrypted
            with the empty password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    # Input streams stay open until the output has been written (pages are
    # pulled from them at write time) and are always closed afterwards.
    input_streams = []
    try:
        for pdfdoc in self.pdfqueue:
            stream = open(pdfdoc.copyname, 'rb')
            input_streams.append(stream)
            pdfdoc_inp = PdfFileReader(stream)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    # Treat a failing decrypt() like a rejected password, but
                    # let KeyboardInterrupt/SystemExit propagate.
                    stat = 0
                if stat != 1:
                    errmsg = _(
                        'File %s is encrypted.\n'
                        'Support for encrypted files has not been implemented yet.\n'
                        'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                #FIXME
                #else
                #   ask for password and decrypt file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]   # 1-based index into pdf_input
            npage = row[3]   # 1-based page number within that document
            current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
            angle = row[6]
            # Rotation already stored on the page, read before applying ours.
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]

            if angle != 0:
                current_page.rotateClockwise(angle)

            if crop != [0., 0., 0., 0.]:
                # Number of quarter turns, rounded to the nearest one.
                # Floor division keeps this an integer for range() regardless
                # of the interpreter's division semantics.
                rotate_times = (((angle + angle0) % 360 + 45) // 90) % 4
                crop_init = crop
                if rotate_times != 0:
                    # Permute the four crop entries to follow the rotation.
                    perm = [0, 2, 1, 3]
                    for it in range(rotate_times):
                        perm.append(perm.pop(0))
                    perm.insert(1, perm.pop(2))
                    crop = [crop_init[perm[side]] for side in range(4)]
                #(x1, y1) = current_page.cropBox.lowerLeft
                #(x2, y2) = current_page.cropBox.upperRight
                (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                # Each crop entry is applied as a fraction of the box width
                # (entries 0 and 1) or height (entries 3 and 2).
                x1_new = int(x1 + (x2 - x1) * crop[0])
                x2_new = int(x2 - (x2 - x1) * crop[1])
                y1_new = int(y1 + (y2 - y1) * crop[3])
                y2_new = int(y2 - (y2 - y1) * crop[2])
                #current_page.cropBox.lowerLeft = (x1_new, y1_new)
                #current_page.cropBox.upperRight = (x2_new, y2_new)
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)

            pdf_output.addPage(current_page)

        # finally, write "output" to the requested file; the ``with`` block
        # guarantees the data is flushed and the handle closed.
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for stream in input_streams:
            stream.close()