def test_run(self, mock_mkdirs_exists_ok, mock_os_rename, mock_path_isdir, mock_path_isfile):
        # Successful run: Tesseract.run() must invoke the mocked Popen with
        # the expected argument list and return a result tuple whose exit
        # code is 0.
        cfg = default_settings()
        cfg.emop_home = "/foo"
        emop_job = mock_emop_job(cfg)
        engine = Tesseract(emop_job)

        # Every isfile() probe answers True and every isdir() probe False.
        mock_path_isfile.return_value = True
        mock_path_isdir.return_value = False

        # Command line the engine is expected to hand to Popen.
        wanted_cmd = ["tesseract", emop_job.image_path, engine.output_filename]
        wanted_cmd += ["-l", emop_job.font.name, engine.cfg]

        wanted_result = mock_results_tuple()(None, None, 0)
        # self.mock_rv / self.mock_popen are not created in this method;
        # presumably they come from the test case's setUp -- TODO confirm.
        self.mock_rv.communicate.return_value[0] = ""

        actual_result = engine.run()
        popen_args, _ = self.mock_popen.call_args

        self.assertTrue(mock_path_isfile.called)
        self.assertTrue(mock_path_isdir.called)
        self.assertTrue(mock_mkdirs_exists_ok.called)
        self.assertTrue(self.mock_popen.called)
        self.assertEqual(wanted_cmd, popen_args[0])
        # NOTE(review): mock_os_rename is accepted but deliberately left
        # unchecked; the rename assertion below is disabled.
        # self.assertTrue(mock_os_rename.called)
        self.assertTupleEqual(wanted_result, actual_result)
    def test_should_run_true_all_values_missing(self):
        # With neither the text output nor the XML output flagged as
        # existing, the engine must report that it should run.
        emop_job = mock_emop_job(default_settings())
        for flag in ("ocr_text_path_exists", "ocr_xml_path_exists"):
            setattr(emop_job.page_result, flag, False)
        engine = Tesseract(emop_job)

        decision = engine.should_run()
        self.assertTrue(decision)
    def test_should_run_false(self):
        # With both the text output and the XML output flagged as existing,
        # the engine must report that it should not run.
        emop_job = mock_emop_job(default_settings())
        for flag in ("ocr_text_path_exists", "ocr_xml_path_exists"):
            setattr(emop_job.page_result, flag, True)
        engine = Tesseract(emop_job)

        decision = engine.should_run()
        self.assertFalse(decision)
Ejemplo n.º 4
0
    def do_ocr(self, job):
        """Dispatch a job to its OCR engine and report whether it succeeded.

        The engine class is selected from ``job.batch_job.ocr_engine``
        ("tesseract" or "ocular").  An unsupported engine name, or an engine
        run that finishes with a non-zero exit code, is recorded as a failed
        result via ``append_result``.  When ``settings.controller_skip_existing``
        is set and the engine says it should not run, the job is skipped and
        treated as a success.

        Args:
            job (EmopJob): EmopJob object

        Returns:
            bool: True if the OCR succeeded or was skipped, False otherwise.
        """
        job_summary = (job.id, job.batch_job.name, job.batch_job.job_type, job.batch_job.ocr_engine)
        logger.info("Got job [%s] - Batch: %s JobType: %s OCR Engine: %s" % job_summary)

        # Pick the engine implementation; reject anything unknown up front.
        engine_name = job.batch_job.ocr_engine
        if engine_name not in ("tesseract", "ocular"):
            unsupported_msg = "OCR with %s not yet supported" % engine_name
            self.append_result(job=job, results=unsupported_msg, failed=True)
            return False
        engine = Tesseract(job=job) if engine_name == "tesseract" else OcularTranscribe(job=job)

        # should_run() is only consulted when skip-existing is enabled.
        must_run = not self.settings.controller_skip_existing or engine.should_run()
        if not must_run:
            logger.info("Skipping OCR job [%s]" % job.id)
            return True

        outcome = engine.run()
        if outcome.exitcode == 0:
            return True

        failure_msg = "%s OCR Failed: %s" % (engine_name, outcome.stderr)
        self.append_result(job=job, results=failure_msg, failed=True)
        return False