コード例 #1
0
def testrun(args, parser):
    """Run the testrun subcommand: reserve pages, process them, optionally upload.

    This function never returns normally; every path ends in ``sys.exit``.
    The exit status is 0 when all requested steps succeeded (including the
    case where uploading is skipped) and 1 as soon as any step fails.

    Args:
        args: parsed command-line namespace. Reads ``config_path``,
            ``testrun_num_pages``, ``filter`` and ``testrun_no_upload``.
        parser: part of the subcommand call signature; not used here.
    """
    submitter = EmopSubmit(args.config_path)

    # Refuse to work outside a cluster job so that a resource intensive run
    # cannot be started on a login node by accident.
    inside_job = submitter.scheduler.is_job_environment()
    if not inside_job:
        print("Can only use testrun subcommand from within a cluster job environment")
        sys.exit(1)

    # Step 1: reserve as many pages as requested on the command line.
    reserved_proc_id = submitter.reserve(
        num_pages=args.testrun_num_pages,
        r_filter=args.filter,
    )
    if not reserved_proc_id:
        print("Failed to reserve pages")
        sys.exit(1)

    # Step 2: process the reserved pages, forcing the run.
    runner = EmopRun(args.config_path, reserved_proc_id)
    if not runner.run(force=True):
        sys.exit(1)

    # Step 3 (optional): stop here when the caller asked for no upload.
    if args.testrun_no_upload:
        sys.exit(0)

    uploader = EmopUpload(args.config_path)
    uploaded = uploader.upload_proc_id(proc_id=reserved_proc_id)
    if not uploaded:
        sys.exit(1)

    sys.exit(0)
コード例 #2
0
 def setUp(self):
     """Replace Popen with a mock, set SLURM_JOB_ID and build the EmopRun under test."""
     # Stand-in for the object Popen returns: exit status 0 and empty
     # output from communicate().
     fake_process = mock.Mock()
     fake_process.returncode = 0
     fake_process.communicate.return_value = ["", ""]
     self.mock_rv = fake_process

     # Patch Popen where emop.lib.utilities looks it up.
     self.popen_patcher = mock.patch("emop.lib.utilities.subprocess32.Popen")
     self.mock_popen = self.popen_patcher.start()
     self.mock_popen.return_value = self.mock_rv

     # Must be set before EmopRun is constructed.
     # NOTE(review): presumably this makes the scheduler detect a job
     # environment - confirm against the scheduler implementation.
     os.environ["SLURM_JOB_ID"] = "2"
     self.run = EmopRun(config_path=default_config_path(), proc_id="0001")
コード例 #3
0
def run(args, parser):
    """Run the run subcommand: process the pages belonging to one PROC_ID.

    The run command is intended to be executed on a compute node and performs
    the actual work of OCRing pages for the supplied --proc-id value.  By
    default the command will not run when it detects the PROC_ID has already
    run; --force-run overrides that, which is useful when a batch job has
    been requeued.

    This function never returns normally: it exits with status 0 when the run
    succeeded and 1 otherwise (including when called outside a cluster job).

    Args:
        args: parsed command-line namespace. Reads ``config_path``,
            ``proc_id`` and ``force_run``.
        parser: part of the subcommand call signature; not used here.
    """
    runner = EmopRun(args.config_path, args.proc_id)

    # Refuse to work outside a cluster job so that a resource intensive run
    # cannot be started on a login node by accident.
    inside_job = runner.scheduler.is_job_environment()
    if not inside_job:
        print("Can only use run subcommand from within a cluster job environment")
        sys.exit(1)

    succeeded = runner.run(force=args.force_run)
    sys.exit(0 if succeeded else 1)
コード例 #4
0
class TestEmopRun(TestCase):
    """Unit tests for EmopRun result bookkeeping and its OCR / post-process steps.

    Every test runs with ``emop.lib.utilities.subprocess32.Popen`` replaced by
    a mock whose process reports exit status 0 and empty output, and with the
    ``SLURM_JOB_ID`` environment variable set to ``"2"``.  When run under
    pytest, ``TMPDIR`` additionally points at the per-test temporary directory.
    All environment changes are undone after each test.
    """

    def setUp(self):
        """Install the Popen and environment patches and build the EmopRun under test."""
        # Cleanups are registered with addCleanup instead of tearDown: unittest
        # skips tearDown when setUp itself raises, which would leave the
        # patches active for the rest of the test session.
        self.popen_patcher = mock.patch("emop.lib.utilities.subprocess32.Popen")
        self.mock_popen = self.popen_patcher.start()
        self.addCleanup(self.popen_patcher.stop)

        # Object handed back by the patched Popen: exit status 0, empty output.
        self.mock_rv = mock.Mock()
        self.mock_rv.communicate.return_value = ["", ""]
        self.mock_rv.returncode = 0
        self.mock_popen.return_value = self.mock_rv

        # Patch the environment rather than assigning to os.environ directly so
        # SLURM_JOB_ID does not leak into tests that run afterwards.
        self.environ_patcher = mock.patch.dict(os.environ, {"SLURM_JOB_ID": "2"})
        self.environ_patcher.start()
        self.addCleanup(self.environ_patcher.stop)

        # NOTE(review): this attribute shadows TestCase.run on the instance.
        # It is kept because every test in this class refers to self.run.
        self.run = EmopRun(config_path=default_config_path(), proc_id="0001")

    @pytest.fixture(autouse=True)
    def setup_files(self, tmpdir, monkeypatch):
        """Expose pytest's per-test temporary directory and point TMPDIR at it."""
        self.tmpdir = tmpdir
        # monkeypatch restores the previous TMPDIR value once the test is over.
        monkeypatch.setenv("TMPDIR", str(self.tmpdir))

    def test_append_result_failed(self):
        """A failed result is saved under the failed queue, prefixed with the SLURM job id."""
        settings = default_settings()
        job = mock_emop_job(settings)
        self.run.payload.save_output = mock.MagicMock()
        self.run.append_result(job=job, results="Test", failed=True)

        # Assert the call happened before unpacking call_args: call_args is
        # None when save_output was never called, and unpacking it would fail
        # with an unrelated TypeError instead of a clear assertion message.
        self.assertTrue(self.run.payload.save_output.called)
        _, payload_save_kwargs = self.run.payload.save_output.call_args
        actual_failed_results = payload_save_kwargs["data"]["job_queues"]["failed"]
        actual_completed_results = payload_save_kwargs["data"]["job_queues"]["completed"]

        expected_failed = {"id": job.id, "results": "SLURM JOB 2: Test"}
        self.assertEqual(1, len(actual_failed_results))
        self.assertEqual(0, len(actual_completed_results))
        self.assertEqual(expected_failed, actual_failed_results[0])

    def test_append_result_completed(self):
        """A successful result is saved under the completed queue as the job id."""
        settings = default_settings()
        job = mock_emop_job(settings)
        self.run.payload.save_output = mock.MagicMock()
        self.run.append_result(job=job, results=None)

        # See test_append_result_failed for why this assertion comes first.
        self.assertTrue(self.run.payload.save_output.called)
        _, payload_save_kwargs = self.run.payload.save_output.call_args
        actual_failed_results = payload_save_kwargs["data"]["job_queues"]["failed"]
        actual_completed_results = payload_save_kwargs["data"]["job_queues"]["completed"]

        expected_completed = [job.id]
        self.assertEqual(0, len(actual_failed_results))
        self.assertEqual(1, len(actual_completed_results))
        self.assertItemsEqual(expected_completed, actual_completed_results)

    def test_get_results(self):
        """get_results() reports the accumulated queues and result lists."""
        self.run.jobs_completed.append(1)
        self.run.jobs_failed.append({"id": 2, "results": "test"})
        self.run.page_results.append({"batch_id": 1, "page_id": 2})
        self.run.postproc_results.append({"batch_job_id": 1, "page_id": 2})
        expected_value = {
            "extra_transfers": [],
            "font_training_results": [],
            "job_queues": {"completed": [1], "failed": [{"id": 2, "results": "test"}]},
            "page_results": [{"batch_id": 1, "page_id": 2}],
            "postproc_results": [{"batch_job_id": 1, "page_id": 2}],
        }
        actual_value = self.run.get_results()
        self.assertEqual(expected_value, actual_value)

    def test_do_process_page_corrector(self):
        """do_process runs the processor and returns a truthy value on exit code 0."""
        settings = default_settings()
        job = mock_emop_job(settings)
        page_corrector = PageCorrector(job=job)
        page_corrector.run = mock.MagicMock()
        results = mock_results_tuple()
        page_corrector.should_run = mock.MagicMock()
        page_corrector.should_run.return_value = True
        page_corrector.run.return_value = results(stdout=None, stderr=None, exitcode=0)

        retval = self.run.do_process(obj=page_corrector, job=job)

        self.assertTrue(page_corrector.run.called)
        self.assertTrue(retval)

    def test_do_process_page_corrector_failed(self):
        """A non-zero exit code is recorded as a failed result and do_process is falsy."""
        settings = default_settings()
        job = mock_emop_job(settings)
        page_corrector = PageCorrector(job=job)
        page_corrector.run = mock.MagicMock()
        results = mock_results_tuple()
        page_corrector.should_run = mock.MagicMock()
        page_corrector.should_run.return_value = True
        page_corrector.run.return_value = results(stdout=None, stderr="Test", exitcode=1)
        self.run.append_result = mock.MagicMock()

        retval = self.run.do_process(obj=page_corrector, job=job)

        self.run.append_result.assert_called_with(job=job, results="PageCorrector Failed: Test", failed=True)
        self.assertFalse(retval)

    def test_do_process_page_corrector_skipped(self):
        """When should_run is False no result is appended and do_process is truthy."""
        settings = default_settings()
        job = mock_emop_job(settings)
        page_corrector = PageCorrector(job=job)
        page_corrector.run = mock.MagicMock()
        results = mock_results_tuple()
        # run is rigged to fail, so a truthy return value below shows that the
        # failure was not recorded.
        page_corrector.run.return_value = results(stdout=None, stderr="Test", exitcode=1)
        flexmock(page_corrector).should_receive("should_run").and_return(False)
        self.run.append_result = mock.MagicMock()

        retval = self.run.do_process(obj=page_corrector, job=job)

        self.assertFalse(self.run.append_result.called)
        self.assertTrue(retval)

    def test_do_process_page_corrector_not_skipped(self):
        """With controller_skip_existing disabled, should_run is never consulted."""
        settings = default_settings()
        self.run.settings.controller_skip_existing = False
        job = mock_emop_job(settings)
        page_corrector = PageCorrector(job=job)
        page_corrector.run = mock.MagicMock()
        results = mock_results_tuple()
        page_corrector.run.return_value = results(stdout=None, stderr=None, exitcode=0)
        page_corrector.should_run = mock.MagicMock()
        self.run.append_result = mock.MagicMock()

        retval = self.run.do_process(obj=page_corrector, job=job)

        self.assertFalse(page_corrector.should_run.called)
        self.assertTrue(retval)

    def test_do_ocr_tesseract(self):
        """do_ocr is truthy when the page image and all output files are reported present."""
        settings = default_settings()
        job = mock_emop_job(settings)
        tesseract = Tesseract(job=job)
        results = mock_results_tuple()
        expected_results = results(stdout=None, stderr=None, exitcode=0)
        flexmock(os.path).should_receive("isfile").with_args(job.image_path).and_return(True)
        mock_mkdirs(job.output_dir)
        flexmock(os.path).should_receive("isfile").with_args(job.txt_file).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.hocr_file).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.xml_file).and_return(True)
        flexmock(tesseract).should_receive("run").and_return(expected_results)

        retval = self.run.do_ocr(job=job)

        self.assertTrue(retval)

    def test_do_ocr_tesseract_failed(self):
        """A missing page image is recorded as a failed result and do_ocr is falsy."""
        settings = default_settings()
        job = mock_emop_job(settings)
        job.page_result.ocr_text_path_exists = False
        job.page_result.ocr_xml_path_exists = False
        tesseract = Tesseract(job=job)
        expected_results = "tesseract OCR Failed: Could not find page image %s" % job.image_path
        flexmock(os.path).should_receive("isfile").with_args(job.txt_file).and_return(False)
        flexmock(os.path).should_receive("isfile").with_args(job.xml_file).and_return(False)
        flexmock(tesseract).should_receive("should_run").and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.image_path).and_return(False)
        flexmock(tesseract).should_receive("run")
        self.run.append_result = mock.MagicMock()

        retval = self.run.do_ocr(job=job)

        self.run.append_result.assert_called_with(job=job, results=expected_results, failed=True)
        self.assertFalse(retval)

    def test_do_ocr_tesseract_skipped(self):
        """When the output files already exist no result is appended and do_ocr is truthy."""
        settings = default_settings()
        job = mock_emop_job(settings)
        tesseract = Tesseract(job=job)
        flexmock(os.path).should_receive("isfile").with_args(job.txt_file).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.xml_file).and_return(True)
        flexmock(tesseract).should_receive("should_run").and_return(False)
        flexmock(tesseract).should_receive("run")
        self.run.append_result = mock.MagicMock()

        retval = self.run.do_ocr(job=job)

        self.assertFalse(self.run.append_result.called)
        self.assertTrue(retval)

    # NOTE: this test does not actually prove that should_run is never called:
    # it still passes when self.run.settings.controller_skip_existing is left
    # at its default instead of being set to False.
    # NOTE(review): presumably the never() expectation is attached to the local
    # `tesseract` instance while do_ocr works with an instance of its own -
    # confirm against do_ocr.  A skipif marker was sketched here previously but
    # never enabled, so the test keeps running.
    def test_do_ocr_tesseract_not_skipped(self):
        """With controller_skip_existing disabled, do_ocr is expected to be truthy."""
        settings = default_settings()
        self.run.settings.controller_skip_existing = False
        job = mock_emop_job(settings)
        tesseract = Tesseract(job=job)
        flexmock(os.path).should_receive("isdir").with_args(tesseract.output_parent_dir).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.txt_file).and_return(False)
        flexmock(os.path).should_receive("isfile").with_args(job.xml_file).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.hocr_file).and_return(True)
        flexmock(os.path).should_receive("isfile").with_args(job.image_path).and_return(True)
        flexmock(tesseract).should_receive("should_run").never()
        flexmock(tesseract).should_receive("run")

        retval = self.run.do_ocr(job=job)

        self.assertTrue(retval)

    def test_do_postprocesses(self):
        """do_postprocesses is truthy when every do_process call succeeds."""
        settings = default_settings()
        job = mock_emop_job(settings)

        # do_process is stubbed for any arguments.  An earlier attempt to set
        # one expectation per processor with with_args(obj=<processor>, job=job)
        # for Denoise, MultiColumnSkew, XML_To_Text, PageEvaluator,
        # PageCorrector and JuxtaCompare did not work; the cause was never
        # established.
        flexmock(self.run).should_receive("do_process").and_return(True)

        retval = self.run.do_postprocesses(job=job)

        self.assertTrue(retval)

    def test_do_postprocesses_failed(self):
        """do_postprocesses is falsy when do_process reports failure."""
        settings = default_settings()
        job = mock_emop_job(settings)

        flexmock(self.run).should_receive("do_process").and_return(False)

        retval = self.run.do_postprocesses(job=job)

        self.assertFalse(retval)

    def test_do_job(self):
        """do_job is truthy when both OCR and post-processing succeed."""
        settings = default_settings()
        job = mock_emop_job(settings)

        flexmock(self.run).should_receive("do_ocr").and_return(True)
        flexmock(self.run).should_receive("do_postprocesses").and_return(True)

        retval = self.run.do_job(job=job)

        self.assertTrue(retval)

    def test_do_job_failed_ocr(self):
        """do_job is falsy when OCR fails, even if post-processing would succeed."""
        settings = default_settings()
        job = mock_emop_job(settings)

        flexmock(self.run).should_receive("do_ocr").and_return(False)
        flexmock(self.run).should_receive("do_postprocesses").and_return(True)

        retval = self.run.do_job(job=job)

        self.assertFalse(retval)

    def test_do_job_failed_postprocesses(self):
        """do_job is falsy when post-processing fails after a successful OCR."""
        settings = default_settings()
        job = mock_emop_job(settings)

        flexmock(self.run).should_receive("do_ocr").and_return(True)
        flexmock(self.run).should_receive("do_postprocesses").and_return(False)

        retval = self.run.do_job(job=job)

        self.assertFalse(retval)