Ejemplo n.º 1
0
 def extractTextFromStructuredDoc(self):
     """
     Run PdfToText.extractPages() for this document, using the "text"
     sub-directory of self.outputDir as the third constructor argument.

     NOTE(review): presumably PdfToText treats that argument as the
     output directory and creates it if missing -- confirm in PdfToText.
     """
     textDir = os.path.join(self.outputDir, 'text')
     self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
     extractor = PdfToText(self.filePath, self.totalPages, textDir)
     extractor.extractPages()
Ejemplo n.º 2
0
 def testStructuredPdfAllPagewise(self):
     # After extractPages() on the sample PDF (built with a count of 5),
     # self.outdir must contain 1.txt through 5.txt.
     extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
     extractor.extractPages()
     for pageNumber in range(1, 6):
         pageFile = os.path.join(self.outdir, "%d.txt" % pageNumber)
         self.assertTrue(os.path.isfile(pageFile))
Ejemplo n.º 3
0
 def testStructuredPdfAllPagewise(self):
     # extractPages() is expected to write one text file per page into
     # self.outdir; each expected file name is checked in page order.
     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.extractPages()
     expectedNames = ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt")
     for fileName in expectedNames:
         self.assertTrue(os.path.isfile(os.path.join(self.outdir, fileName)))
Ejemplo n.º 4
0
 def processToCheckStructured(self):
     """
     Dump the PDF through PdfToText.dumpPages(), add the size in bytes of
     the resulting file (pdfToText.dumpedTextFilepath) to
     self.textContentSize, then log the accumulated size and the result
     of self.isStructured().
     """
     dumper = PdfToText(self.filePath, self.totalPages, self.outputDir)
     dumper.dumpPages()
     dumpedBytes = os.path.getsize(dumper.dumpedTextFilepath)
     # Accumulates rather than overwrites: any size already recorded on
     # the instance is kept.
     self.textContentSize += dumpedBytes
     self.logger.info('Text content size: %d bytes', self.textContentSize)
     structured = self.isStructured()
     self.logger.info('Structured? %s', structured)
Ejemplo n.º 5
0
 def testStructuredPdfAllPagesDump(self):
     # dumpPages() on tests/sample.pdf must leave sample.txt in self.outdir.
     dumper = PdfToText('tests/sample.pdf', 5, self.outdir)
     dumper.dumpPages()
     dumpFile = os.path.join(self.outdir, "sample.txt")
     self.assertTrue(os.path.isfile(dumpFile))
Ejemplo n.º 6
0
 def testScannedPdfPage(self):
     # Extracting page 2 of the scanned sample must produce 2.txt in
     # self.outdir.
     extractor = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
     extractor.extractPage(2)
     pageFile = os.path.join(self.outdir, "2.txt")
     self.assertTrue(os.path.isfile(pageFile))
Ejemplo n.º 7
0
 def testStructuredPdfPage(self):
     # Extracting page 1 of the structured sample must produce 1.txt in
     # self.outdir.
     extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
     extractor.extractPage(1)
     pageFile = os.path.join(self.outdir, "1.txt")
     self.assertTrue(os.path.isfile(pageFile))
Ejemplo n.º 8
0
 def testStructuredPdfAllPagesDump(self):
     # The whole-document dump is expected at <outdir>/sample.txt.
     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.dumpPages()
     expectedDump = os.path.join(self.outdir, "sample.txt")
     dumpExists = os.path.isfile(expectedDump)
     self.assertTrue(dumpExists)
Ejemplo n.º 9
0
 def testScannedPdfPage(self):
     # Single-page extraction (page 2) from the scanned sample; the
     # result is expected at <outdir>/2.txt.
     converter = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
     converter.extractPage(2)
     expectedPage = os.path.join(self.outdir, "2.txt")
     pageExists = os.path.isfile(expectedPage)
     self.assertTrue(pageExists)
Ejemplo n.º 10
0
 def testStructuredPdfPage(self):
     # Single-page extraction (page 1) from the structured sample; the
     # result is expected at <outdir>/1.txt.
     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.extractPage(1)
     expectedPage = os.path.join(self.outdir, "1.txt")
     pageExists = os.path.isfile(expectedPage)
     self.assertTrue(pageExists)