def get(self):
    """Fetch a PDF by URL and respond with its extracted text as XML.

    Request parameters (read through ``self.request.get``):
        url: Location of the PDF. It is unquoted and opened with
            ``urllib2.urlopen``.
        pg: Optional 1-based page number. When missing or empty, the text
            of every page is returned.

    The response is written to ``self.response.out`` with content type
    ``application/xml``: a ``<document url=... title=...>`` element holding
    one ``<page number=N>`` element per extracted page, each wrapping the
    page text in a CDATA section.

    Raises:
        ValueError: If ``pg`` is present but is not an integer.
        Errors raised by ``urllib2.urlopen`` or ``PdfFileReader`` (bad URL,
        network failure, unreadable PDF, page out of range) propagate.
    """
    # Imported locally so this handler does not depend on a file-level import.
    from xml.sax.saxutils import escape

    def _attr(value):
        # The attributes below are single-quoted, so "'" must be escaped in
        # addition to the "&", "<" and ">" that escape() handles by default.
        return escape(value, {"'": "&apos;"})

    def _page_xml(index):
        text = p.getPage(index).extractText()
        # "]]>" would terminate the CDATA section early; split it across two
        # adjacent CDATA sections so the text stays intact.
        text = text.replace("]]>", "]]]]><![CDATA[>")
        return "<page number='%d'><![CDATA[%s]]></page>\n" % (index + 1, text)

    url = self.request.get('url') or ""
    requested_page = self.request.get('pg')
    # None means "all pages"; otherwise a zero-based page index.
    page = int(requested_page) - 1 if requested_page else None

    # NOTE(security): the URL comes straight from the request and is fetched
    # server-side without any scheme/host restriction. Confirm this endpoint
    # is not reachable by untrusted callers, or add an allow-list.
    u = urllib2.urlopen(urllib2.unquote(url))
    try:
        data = u.read()
    finally:
        u.close()

    output = StringIO.StringIO()
    try:
        output.write(data)
        p = PdfFileReader(output)
        info = p.getDocumentInfo()
        # Both the info dictionary and its title may be absent.
        title = (info.title if info is not None else None) or ""
        if page is None:
            content = "".join(_page_xml(i) for i in range(p.getNumPages()))
        else:
            content = _page_xml(page)
    finally:
        output.close()

    result = "<?xml version='1.0' encoding='UTF-8'?>\n<document url='%s' title='%s'>%s</document>" % (
        _attr(url), _attr(title), content)
    self.response.headers['Content-type'] = 'application/xml'
    self.response.out.write(result)
def printMeta(filename):
    """Print the document-information entries of a PDF file.

    A header line naming the file is printed first, followed by one line per
    key/value pair of the object returned by ``getDocumentInfo()``. If that
    call returns ``None``, only the header is printed.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.
    """
    # The with-block guarantees the file handle is released. The values are
    # read while the file is still open because the reader may resolve them
    # from the stream on access (assumption - confirm against the PDF library).
    with open(filename, "rb") as pdf_file:
        info = PdfFileReader(pdf_file).getDocumentInfo()
        print("[*] PDF Metadata for:" + filename)
        if info is None:
            # No information dictionary to iterate over.
            return
        for item in info:
            print("[+]", item, ":", info[item])