Example #1
0
 def extractTextFromStructuredDoc(self):
     """
     Dump the document's pages as text under the "text" subdirectory
     of ``self.outputDir``.

     Logs the target directory, then builds a ``PdfToText`` from
     ``self.filePath``, ``self.totalPages`` and that directory and calls
     its ``extractPages()``.

     NOTE(review): this method does not create the directory itself;
     presumably ``PdfToText`` does -- confirm against its implementation.
     """
     # Resolve the destination once so the log line and the extractor agree.
     textDir = os.path.join(self.outputDir, 'text')
     self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
     PdfToText(self.filePath, self.totalPages, textDir).extractPages()
Example #2
0
 def testStructuredPdfAllPagewise(self):
     """
     Run ``PdfToText`` on 'tests/sample.pdf' with a page count of 5 and
     check that "1.txt" through "5.txt" exist in ``self.outdir``.
     """
     pageCount = 5
     extractor = PdfToText('tests/sample.pdf', pageCount, self.outdir)
     extractor.extractPages()
     # One output file per page, named by its 1-based page number.
     for pageNo in range(1, pageCount + 1):
         pageFile = os.path.join(self.outdir, "%d.txt" % pageNo)
         self.assertTrue(os.path.isfile(pageFile))
 def testStructuredPdfAllPagewise(self):
     """
     Extract 'tests/sample.pdf' page by page (page count 5) with
     ``PdfToText`` and assert that each of "1.txt" .. "5.txt" is a
     regular file inside ``self.outdir``.

     NOTE(review): a method with this exact name is also defined earlier
     in this listing; if both live in the same class, this later
     definition replaces the earlier one -- confirm and deduplicate.
     """
     totalPages = 5
     PdfToText('tests/sample.pdf', totalPages, self.outdir).extractPages()
     expectedNames = [str(number) + ".txt" for number in range(1, totalPages + 1)]
     for fileName in expectedNames:
         self.assertTrue(os.path.isfile(os.path.join(self.outdir, fileName)))