def get(self):
    """Fetch a remote PDF and respond with its extracted text as XML.

    Request parameters (read through ``self.request.get``):
        url: URL-quoted location of the PDF. It is unquoted and opened
            with ``urllib2.urlopen``.
        pg: optional 1-based page number. When missing or empty, the
            text of every page is returned.

    The response has content type ``application/xml`` and consists of a
    ``<document url='...' title='...'>`` element holding one
    ``<page number='N'>`` element per extracted page; the page text is
    wrapped in a CDATA section.

    Raises:
        ValueError: if ``pg`` is not an integer literal.
        IndexError: if ``pg`` is outside ``1..number of pages``.
        Errors raised while downloading or parsing the PDF propagate
        unchanged.
    """
    # Imported locally so this method does not depend on a file-level import.
    from xml.sax.saxutils import escape

    def xml_attr(value):
        # The attributes are single-quoted, so "'" has to be escaped in
        # addition to the &, < and > that escape() handles by default.
        return escape(value, {"'": "&apos;"})

    def page_xml(reader, index):
        text = reader.getPage(index).extractText()
        # "]]>" would close the CDATA section early; split it across two
        # sections so arbitrary page text keeps the document well-formed.
        text = text.replace("]]>", "]]]]><![CDATA[>")
        return "<page number='%d'><![CDATA[%s]]></page>\n" % (index + 1, text)

    url = self.request.get('url') or ""
    page = self.request.get('pg')
    page = int(page) - 1 if page else 'all'

    # NOTE(review): the URL comes straight from the request and is fetched
    # server-side; consider restricting allowed schemes/hosts.
    u = urllib2.urlopen(urllib2.unquote(url))
    try:
        data = u.read()
    finally:
        u.close()

    output = StringIO.StringIO()
    try:
        output.write(data)
        p = PdfFileReader(output)
        pages = p.getNumPages()

        # getDocumentInfo() may return None when the PDF carries no
        # document information, and the title itself may be absent.
        info = p.getDocumentInfo()
        title = info.title if info is not None else None

        if page == "all":
            content = "".join(page_xml(p, i) for i in range(pages))
        else:
            # Reject 0 and negative numbers explicitly: a negative index
            # would otherwise silently select a page counted from the end.
            if not 0 <= page < pages:
                raise IndexError(
                    "page %d out of range (document has %d pages)"
                    % (page + 1, pages))
            content = page_xml(p, page)
    finally:
        output.close()

    result = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        "<document url='%s' title='%s'>%s</document>"
    ) % (xml_attr(url), xml_attr(title or ""), content)

    self.response.headers['Content-type'] = 'application/xml'
    self.response.out.write(result)
def parse(self, file_full, statdata):
    """Extract the text of every page of a PDF file.

    The page count is stored in ``self._extra['pages']``.

    Args:
        file_full: path of the PDF file to read.
        statdata: not used by this method.

    Returns:
        The text of all pages in document order, with each page's text
        preceded by a single space. An empty string if the document has
        no pages.
    """
    # The reader pulls page data from the stream lazily, so all extraction
    # has to happen while the file is still open.
    with open(file_full, 'rb') as stream:
        pdf = PdfFileReader(stream)
        pages = pdf.getNumPages()
        self._extra['pages'] = pages
        # Page indices are 0-based: iterate 0..pages-1 directly. Indexing
        # with ``pagenr - 1`` would start at -1, i.e. the last page.
        return ''.join(
            ' ' + pdf.getPage(pagenr).extractText()
            for pagenr in range(pages)
        )
def mergePdf(self):
    """Stamp a title/URL line and a creation timestamp onto the temp PDF.

    A one-page overlay is drawn with ReportLab: the (possibly shortened)
    title followed by the lower-cased URL at (5, 830), and
    'Created Date Time: <timestamp>' at (420, 5). The overlay is merged
    into every page of ``self.tempPdfFile``, the result is written to
    ``self.filePath + self.fileName``, and the temp file is removed.

    If ``self.title`` is empty it is first derived from the last path
    segment of ``self.url`` (``self.title`` is updated in place).

    Returns:
        The result of ``self.printWebHtmlToPdf(...)`` when the temp PDF
        does not exist; otherwise None.
    """
    self.threadPdfWritingStatus.emit(
        '<b>Setting Title for</b> %s<b>. Please Wait...</b><br />' % self.url)

    packet = StringIO()
    # Create the overlay PDF with ReportLab.
    pdfCanvas = canvas.Canvas(packet, pagesize=A4)
    pdfCanvas.setFont('Helvetica', 8)

    if len(self.title) == 0:
        # No title supplied: fall back to the URL's last path segment,
        # keep the part matched before the final ".ext", and turn
        # underscores into spaces.
        self.title = str(self.url).split('/')[-1]
        self.title = self.regex.getSearchedData(
            r'(?i)([a-zA-Z0-9-_ ]*?)\.[a-zA-Z0-9_]*$', self.title)
        self.title = self.regex.replaceData('(?i)_', ' ', self.title)

    # Shorten over-long values so the line stays compact; each value is
    # judged by its own length.
    if len(self.title) > 60:
        title = unicode(self.title[:57] + '...')
    else:
        title = unicode(self.title)
    url = self.url[:57] + '...' if len(self.url) > 60 else self.url
    pdfCanvas.drawString(5, 830, title + ' ' + str(url).lower())

    d = datetime.datetime.now()
    strDate = str(d.strftime("%Y-%m-%d %H-%M-%S %p"))
    pdfCanvas.drawString(420, 5, 'Created Date Time: ' + strDate)
    pdfCanvas.save()
    packet.seek(0)
    newPdf = PdfFileReader(packet)

    # Without a temp PDF there is nothing to stamp; hand over to the
    # HTML-to-PDF path instead.
    if not os.path.exists(self.tempPdfFile):
        return self.printWebHtmlToPdf(self.url, self.filePath, self.fileName)

    writer = PdfFileWriter()
    overlayPage = newPdf.getPage(0)
    # The source file must stay open until the writer has produced its
    # output; the context managers close both files even on error.
    with open(self.tempPdfFile, 'rb') as tmpPdfFile:
        reader = PdfFileReader(tmpPdfFile)
        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            page.mergePage(overlayPage)
            writer.addPage(page)
        print('Filename: ' + self.fileName)
        with open(self.filePath + self.fileName, "wb") as outputStream:
            writer.write(outputStream)

    os.remove(str(self.tempPdfFile))
def extractTextFromPdfStream(stream):
    """Return the text of every page of the PDF in *stream*.

    The stream is handed to ``PdfFileReader``; the extracted text of the
    pages is joined in page order with newline characters.
    """
    pdf = PdfFileReader(stream)
    pageTexts = []
    for index in range(pdf.getNumPages()):
        pageTexts.append(pdf.getPage(index).extractText())
    return '\n'.join(pageTexts)
def extract_text_from_pdf_stream(stream):
    """Return the text of all pages of the PDF read from *stream*.

    Pages are processed in order and their extracted text is separated
    by a single newline.
    """
    pdf = PdfFileReader(stream)
    page_count = pdf.getNumPages()
    page_texts = [pdf.getPage(number).extractText() for number in range(page_count)]
    return '\n'.join(page_texts)