def testScannedPdfPageForUnauthorisec(self):
     """Check that OCR with invalid credentials is rejected with HTTP 401.

     Runs PdfSeparate on the one-page scanned sample, confirms that
     ``1.pdf`` exists in ``self.indir``, then processes page 1 through
     AbbyyPdfTextExtractor using bogus credentials and expects an
     ``HTTPError`` whose code is 401 and whose reason is "Unauthorized".
     """
     pdfSeparate = PdfSeparate("tests/sample-scanned-1.pdf", self.indir)
     pdfSeparate.extractPages()
     self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))
     # assertRaises fails the test when no HTTPError is raised; a plain
     # try/except would pass without ever checking the 401 response.
     with self.assertRaises(HTTPError) as raised:
         abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
         abbyyPdf.setApplicationCredentials("nouser", "nopassword")
         abbyyPdf.processPdfPage(1)
     self.assertEqual(raised.exception.code, 401)
     self.assertEqual(raised.exception.reason, "Unauthorized")
    def testScanned44PdfPageForNetwork(self):
        """Check that OCR over the 44-page scanned sample produces text files.

        Runs PdfSeparate on the 44-page sample, confirms that ``1.pdf``
        exists in ``self.indir``, then runs AbbyyPdfTextExtractor with the
        credentials from the ``abbyy`` config section and expects ``1.txt``
        and ``44.txt`` in ``self.outdir``.

        If the OCR run itself raises (for example missing configuration or
        an unreachable service), the test is reported as skipped.
        """
        pdfSeparate = PdfSeparate('tests/sample-scanned-44pages.pdf', self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 44, "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get('abbyy', 'appid'),
                self.configParser.get('abbyy', 'password'))
            abbyyPdf.extractPages()
        except Exception as exc:
            # Keep the best-effort behaviour, but make it visible as a skip
            # instead of a silent pass.
            self.skipTest("Abbyy OCR could not be run: %s" % exc)

        # The assertions live outside the try block: AssertionError is an
        # Exception subclass, so a broad handler would swallow failures.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "44.txt")))
    def testScannedPdfPage(self):
        """Check that OCR of a single scanned page produces ``1.txt``.

        Runs PdfSeparate on the one-page sample, confirms that ``1.pdf``
        exists in ``self.indir``, then processes page 1 through
        AbbyyPdfTextExtractor with the credentials from the ``abbyy`` config
        section and expects ``1.txt`` in ``self.outdir``.

        If the OCR run itself raises (for example missing configuration or
        an unreachable service), the test is reported as skipped.
        """
        pdfSeparate = PdfSeparate("tests/sample-scanned-1.pdf", self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get("abbyy", "appid"), self.configParser.get("abbyy", "password")
            )
            abbyyPdf.processPdfPage(1)
        except Exception as exc:
            # Keep the best-effort behaviour, but make it visible as a skip
            # instead of a silent pass.
            self.skipTest("Abbyy OCR could not be run: %s" % exc)

        # The assertion lives outside the try block: AssertionError is an
        # Exception subclass, so a broad handler would swallow the failure.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
Ejemplo n.º 4
0
 def extractTextFromScannedDoc(self):
     """
     Run the Abbyy extractor over this document's pages.

     Logs the page count and the ``text`` subdirectory of ``self.outputDir``,
     builds an AbbyyPdfTextExtractor from the ``pages`` and ``text``
     subdirectories, the total page count and the language, sets the
     credentials read from the ``abbyy`` config section, and calls
     ``extractPages()`` on it.
     """
     textDir = os.path.join(self.outputDir, 'text')
     self.logger.info('Calling Abbyy: OCR-ing %d pages at %s', self.totalPages, textDir)

     pagesDir = os.path.join(self.outputDir, 'pages')
     extractor = AbbyyPdfTextExtractor(pagesDir, textDir, self.totalPages, self.language)

     appId = self.configParser.get('abbyy', 'appid')
     password = self.configParser.get('abbyy', 'password')
     extractor.setApplicationCredentials(appId, password)
     extractor.extractPages()
 def testScannedPdfPageForUnauthorisec(self):
     """Check that OCR with invalid credentials is rejected with HTTP 401.

     Runs PdfSeparate on the one-page scanned sample, confirms that
     ``1.pdf`` exists in ``self.indir``, then processes page 1 through
     AbbyyPdfTextExtractor using bogus credentials and expects an
     ``HTTPError`` whose code is 401 and whose reason is "Unauthorized".
     """
     pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
     pdfSeparate.extractPages()
     self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))
     # assertRaises fails the test when no HTTPError is raised; a plain
     # try/except would pass without ever checking the 401 response.
     with self.assertRaises(HTTPError) as raised:
         abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1,
                                          "english")
         abbyyPdf.setApplicationCredentials('nouser', 'nopassword')
         abbyyPdf.processPdfPage(1)
     self.assertEqual(raised.exception.code, 401)
     self.assertEqual(raised.exception.reason, "Unauthorized")
    def testScannedPdfPage(self):
        """Check that OCR of a single scanned page produces ``1.txt``.

        Runs PdfSeparate on the one-page sample, confirms that ``1.pdf``
        exists in ``self.indir``, then processes page 1 through
        AbbyyPdfTextExtractor with the credentials from the ``abbyy`` config
        section and expects ``1.txt`` in ``self.outdir``.

        If the OCR run itself raises (for example missing configuration or
        an unreachable service), the test is reported as skipped.
        """
        pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1,
                                             "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get('abbyy', 'appid'),
                self.configParser.get('abbyy', 'password'))
            abbyyPdf.processPdfPage(1)
        except Exception as exc:
            # Keep the best-effort behaviour, but make it visible as a skip
            # instead of a silent pass.
            self.skipTest("Abbyy OCR could not be run: %s" % exc)

        # The assertion lives outside the try block: AssertionError is an
        # Exception subclass, so a broad handler would swallow the failure.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))