Esempio n. 1
0
    def run(self, postproc):
        """Compare OCR text against the page's ground truth with RetasCompare.

        The jar at ``self.executable`` is run through ``java`` with the
        ground truth file, the selected input file and ``-opt self.cfg``.
        The last tab-separated field of its stdout is parsed as a float.

        Args:
            postproc (bool): When true, ``self.job.alto_txt_file`` is compared
                and the score is stored in
                ``self.job.page_result.alt_change_index``.  Otherwise
                ``self.job.idhmc_txt_file`` is compared and the score is
                computed but not stored.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success or when the page has no ground truth, 1 when the input
            file is missing or the output cannot be parsed, and the command's
            own exit code when the command fails.
        """
        # Nothing to compare against: treated as a successful no-op.
        if not self.job.page.hasGroundTruth():
            return self.results(stdout=None, stderr=None, exitcode=0)

        input_file = self.job.alto_txt_file if postproc else self.job.idhmc_txt_file

        if not input_file or not os.path.isfile(input_file):
            stderr = "Could not find RetasCompare input file: %s" % input_file
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        cmd = [
            "java", "-Xms128M", "-Xmx128M", "-jar", self.executable,
            self.job.page.ground_truth_file, input_file,
            "-opt", self.cfg,
        ]
        proc = exec_cmd(cmd)
        if proc.exitcode != 0:
            stderr = "RetasCompare of %s failed: %s" % (input_file, proc.stderr)
            return self.results(stdout=proc.stdout, stderr=stderr, exitcode=proc.exitcode)

        out = proc.stdout.strip()
        values = re.split(r"\t", out)
        # A zero exit code does not guarantee numeric output; report a
        # failed result instead of letting ValueError escape to the caller.
        try:
            value = float(values[-1])
        except ValueError:
            stderr = "RetasCompare Error: unexpected response format: %s" % out
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        # Only the post-processing comparison is persisted.
        if postproc:
            self.job.page_result.alt_change_index = value

        return self.results(stdout=None, stderr=None, exitcode=0)
    def submit_job(self, proc_id, num_pages, dependency=None):
        """Submit a job to SLURM

        Before the job is submitted some environment variables are set
        which are then used by SLURM.

        ``PROC_ID`` tells the SLURM job which JSON file to load.
        ``EMOP_CONFIG_PATH`` tells the SLURM job which INI file should be used.

        Args:
            proc_id (str or int): proc_id to be used by submitted job
            num_pages (int): Number of pages being scheduled
            dependency: Passed through unchanged to ``get_submit_cmd``
                (defaults to None).

        Returns:
            str: SLURM Job ID (False returned if failed)
        """
        if not proc_id:
            logger.error("EmopSLURM#submit_job(): Must provide valid proc_id.")
            return False

        # os.environ only accepts strings, but proc_id is documented as
        # "str or int", so coerce it before exporting.
        os.environ['PROC_ID'] = str(proc_id)
        os.environ['EMOP_CONFIG_PATH'] = self.settings.config_path
        cmd = self.get_submit_cmd(num_pages=num_pages, dependency=dependency)
        proc = exec_cmd(cmd, log_level="debug")
        if proc.exitcode != 0:
            logger.error("Failed to submit job to SLURM: %s", proc.stderr)
            return False
        # The trailing-whitespace-stripped stdout is used as the job ID.
        slurm_job_id = proc.stdout.rstrip()
        logger.info("SLURM job %s submitted for PROC_ID %s", slurm_job_id, proc_id)
        return slurm_job_id
    def run(self):
        """Run the PageEvaluator jar on the job's XML file and store its scores.

        The command's stdout is expected to be two comma-separated values.
        They are stored, in order, as ``pp_ecorr`` and ``pp_pg_quality`` on
        ``self.job.postproc_result``; a value of ``'NaN'`` is stored as
        ``'-1'``.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the XML file is missing or the output does not
            have exactly two fields, and the command's own exit code when
            the command fails.
        """
        xml_path = self.job.xml_file
        if not (xml_path and os.path.isfile(xml_path)):
            return self.results(
                stdout=None,
                stderr="Could not find XML file: %s" % xml_path,
                exitcode=1,
            )

        # TODO Move -Xms and -Xmx into config.ini
        proc = exec_cmd(["java", self.java_args, "-jar", self.executable, "-q", xml_path])
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

        raw = proc.stdout.strip()
        fields = raw.split(",")
        if len(fields) != 2:
            return self.results(
                stdout=None,
                stderr="PageEvaluator Error: unexpected response format: %s" % raw,
                exitcode=1,
            )

        # Invalid values returned by PageEvaluator are recorded as '-1'.
        ecorr, quality = ['-1' if field == 'NaN' else field for field in fields]

        result = self.job.postproc_result
        result.pp_ecorr = ecorr
        result.pp_pg_quality = quality
        return self.results(stdout=None, stderr=None, exitcode=0)
    def submit_transfer_job(self, task_id):
        """Submit a transfer job to SLURM

        Before the job is submitted some environment variables are set
        which are then used by SLURM.

        ``TASK_ID`` tells the SLURM job which task ID to monitor.
        ``EMOP_CONFIG_PATH`` tells the SLURM job which INI file should be used.

        Args:
            task_id (str or int): task_id to be used by submitted job

        Returns:
            str: ID of job submitted (False returned if failed)
        """
        if not task_id:
            logger.error("EmopSLURM#submit_transfer_job(): Must provide valid task_id.")
            return False

        # os.environ only accepts strings, but task_id is documented as
        # "str or int", so coerce it before exporting.
        os.environ['TASK_ID'] = str(task_id)
        os.environ['EMOP_CONFIG_PATH'] = self.settings.config_path
        _queue = self.settings.scheduler_transfer_queue
        cmd = self.get_submit_cmd(
            queue=_queue,
            name='emop-transfer',
            mem_per_cpu='2000',
            cpus_per_task='1',
            job_type='transfer',
        )
        proc = exec_cmd(cmd, log_level="debug")
        if proc.exitcode != 0:
            logger.error("Failed to submit transfer job to SLURM: %s", proc.stderr)
            return False
        # The trailing-whitespace-stripped stdout is used as the job ID.
        slurm_job_id = proc.stdout.rstrip()
        logger.info("SLURM job %s submitted for TASK_ID %s", slurm_job_id, task_id)
        return slurm_job_id
    def run(self):
        """Run ``tesseract`` on the job's page image and record output paths.

        After a successful run the hOCR file is renamed to the job's XML
        file when the former exists and the latter does not, then
        ``ocr_text_path`` and ``ocr_xml_path`` are set on
        ``self.job.page_result``.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the image path is unset or not a file, and the
            command's own exit code when the command fails.
        """
        image = self.job.image_path
        if not image:
            return self.results(
                stdout=None, stderr="No image path could be determined", exitcode=1)
        if not os.path.isfile(image):
            return self.results(
                stdout=None, stderr="Could not find page image %s" % image, exitcode=1)

        # Make sure the directory that receives the output exists.
        parent_dir = self.output_parent_dir
        if not os.path.isdir(parent_dir):
            mkdirs_exists_ok(parent_dir)

        proc = exec_cmd([
            "tesseract", image, self.output_filename,
            "-l", self.job.font.name,
            self.cfg,
        ])
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

        # Rename the hOCR file to XML, never overwriting an existing XML file.
        hocr_path = self.job.hocr_file
        if os.path.isfile(hocr_path) and not os.path.isfile(self.job.xml_file):
            logger.debug("Renaming %s to %s" % (hocr_path, self.job.xml_file))
            os.rename(hocr_path, self.job.xml_file)

        page_result = self.job.page_result
        page_result.ocr_text_path = self.job.txt_file
        page_result.ocr_xml_path = self.job.xml_file
        return self.results(stdout=None, stderr=None, exitcode=0)
    def run(self):
        """Run the Ocular ``Transcribe`` main class and collect its outputs.

        ``generate_input_doc_list`` is called first.  After a successful
        run, each job in ``self.job.jobs`` gets ``ocr_text_path`` and
        ``ocr_xml_path`` set on its ``page_result`` for whichever expected
        output files exist under ``self.transcribed_output_path``.  If
        ``self.transcription_dir`` is a directory it is appended to
        ``self.job.extra_transfers``.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the input font path is unset or not a file, and
            the command's own exit code when the command fails.
        """
        self.generate_input_doc_list()

        font_path = self.input_font_path
        if not font_path:
            return self.results(
                stdout=None, stderr="No input font path could be determined", exitcode=1)
        if not os.path.isfile(font_path):
            return self.results(
                stdout=None,
                stderr="Could not find input font path %s" % font_path,
                exitcode=1,
            )

        # Make sure the output directory exists before the tool runs.
        if not os.path.isdir(self.output_path):
            mkdirs_exists_ok(self.output_path)

        transcribe_cmd = [
            "java", self.java_max_heap,
            "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.Transcribe",
            "-jar", self.jar,
            "-outputPath", self.output_path,
            "-inputDocListPath", self.input_doc_list_path,
            "-inputFontPath", font_path,
            "-inputLmPath", self.input_lm_path,
            "-inputGsmPath", self.input_gsm_path,
            "-allowGlyphSubstitution", "true",
            "-skipAlreadyTranscribedDocs", 'true',
            "-emissionEngine", self.job.settings.ocular_emission_engine,
        ]
        if self.extra_command_parameters:
            transcribe_cmd = transcribe_cmd + self.extra_command_parameters

        proc = exec_cmd(transcribe_cmd, realtime=True)
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

        # Derive the expected output names from each page's image name and
        # record only the files that are actually present.
        out_dir = self.transcribed_output_path
        for page_job in self.job.jobs:
            stem = os.path.splitext(os.path.basename(page_job.image_path))[0]
            text_file = os.path.join(out_dir, "%s%s" % (stem, self.ocr_text_suffix))
            alto_file = os.path.join(out_dir, "%s.alto.xml" % stem)
            if os.path.isfile(text_file):
                page_job.page_result.ocr_text_path = text_file
            if os.path.isfile(alto_file):
                page_job.page_result.ocr_xml_path = alto_file

        # Add extra transfers
        if os.path.isdir(self.transcription_dir):
            self.job.extra_transfers.append(self.transcription_dir)

        return self.results(stdout=None, stderr=None, exitcode=0)
    def run(self):
        """Run the Ocular ``TrainFont`` main class and record trained models.

        ``generate_input_doc_list`` is called first.  After a successful
        run, the output font, language model and glyph substitution model
        paths are stored on ``self.job.font_training_result``, each only if
        the corresponding file exists.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the input font path is unset or not a file, and
            the command's own exit code when the command fails.
        """
        self.generate_input_doc_list()

        font_path = self.input_font_path
        if not font_path:
            return self.results(
                stdout=None, stderr="No input font path could be determined", exitcode=1)
        if not os.path.isfile(font_path):
            return self.results(
                stdout=None,
                stderr="Could not find input font path %s" % font_path,
                exitcode=1,
            )

        # Make sure the output directory exists before the tool runs.
        if not os.path.isdir(self.output_path):
            mkdirs_exists_ok(self.output_path)

        train_cmd = [
            "java", self.java_max_heap,
            "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.TrainFont",
            "-jar", self.jar,
            "-outputPath", self.output_path,
            "-inputDocListPath", self.input_doc_list_path,
            "-inputFontPath", font_path,
            "-inputLmPath", self.input_lm_path,
            "-inputGsmPath", self.input_gsm_path,
            "-outputFontPath", self.output_font_path,
            "-outputLmPath", self.output_lm_path,
            "-outputGsmPath", self.output_gsm_path,
            "-continueFromLastCompleteIteration", "true",
            "-allowGlyphSubstitution", "true",
            "-updateLM", "true",
            "-updateGsm", "true",
            "-emissionEngine", self.job.settings.ocular_emission_engine,
        ]
        if self.extra_command_parameters:
            train_cmd = train_cmd + self.extra_command_parameters

        proc = exec_cmd(train_cmd, realtime=True)
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

        # Only set font_training_result on one page (job) since this is a per-work result
        training = self.job.font_training_result
        if os.path.isfile(self.output_font_path):
            training.font_path = self.output_font_path
        if os.path.isfile(self.output_lm_path):
            training.language_model_path = self.output_lm_path
        if os.path.isfile(self.output_gsm_path):
            training.glyph_substitution_model_path = self.output_gsm_path

        return self.results(stdout=None, stderr=None, exitcode=0)
    def current_job_count(self):
        """Get the count of this application's active jobs

        The current jobs are those that are Running+Pending.

        Example command used:
            squeue -r --noheader -p idhmc -n emop-controller

        The count is the number of lines ``squeue`` prints to stdout; the
        command's exit code is not inspected.

        Returns:
            int: The number of current jobs
        """
        settings = self.settings
        squeue_cmd = [
            "squeue", "-r", "--noheader",
            "-p", settings.scheduler_queue,
            "-n", settings.scheduler_job_name,
        ]
        listing = exec_cmd(squeue_cmd, log_level="debug")
        return len(listing.stdout.splitlines())
    def run(self):
        """Run the PageCorrector jar on the job's XML file.

        The command's stdout must be valid JSON.  It is stored, as the raw
        string, in ``self.job.postproc_result.pp_health``, and the corrected
        text/XML paths on ``self.job.page_result`` are set to the job's
        ``alto_txt_file`` and ``alto_xml_file``.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the XML file is missing or the output is not
            valid JSON, and the command's own exit code when the command
            fails.
        """
        xml_path = self.job.xml_file
        if not (xml_path and os.path.isfile(xml_path)):
            return self.results(
                stdout=None,
                stderr="Could not find XML file: %s" % xml_path,
                exitcode=1,
            )

        # NOTE(review): the glob result is passed as a nested list after
        # "--dict"; presumably exec_cmd flattens it - confirm.
        dictionaries = glob.glob("%s/*.dict" % self.dicts_dir)
        cmd = [
            "java", self.java_args, "-jar", self.executable,
            "--dbconf", self.cfg,
            "-t", self.rules_file,
            "-o", self.job.output_dir,
            "--stats",
            "--alt", self.alt_arg,
            "--max-transforms", self.max_transforms,
            "--noiseCutoff", self.noise_cutoff,
            "--dict", dictionaries,
        ]
        # Optional arguments are only passed when configured.
        if self.ctx_min_match:
            cmd.extend(["--ctx-min-match", self.ctx_min_match])
        if self.ctx_min_vol:
            cmd.extend(["--ctx-min-vol", self.ctx_min_vol])
        if self.dump:
            cmd.append("--dump")
        if self.save:
            cmd.append("--save")
        cmd.extend(["--", xml_path])

        proc = exec_cmd(cmd, timeout=self.timeout)
        if proc.exitcode != 0:
            # TODO: PageCorrector errors are going to stdout not stderr
            # stderr is reported only when stdout is empty and stderr is not.
            use_stderr = not proc.stdout and proc.stderr
            message = proc.stderr if use_stderr else proc.stdout
            return self.results(stdout=proc.stdout, stderr=message, exitcode=proc.exitcode)

        payload = proc.stdout.strip()

        # Check that output is valid JSON
        try:
            json.loads(payload)
        except ValueError:
            return self.results(
                stdout=None,
                stderr="PageCorrector Error: output is not valid JSON: %s" % payload,
                exitcode=1,
            )

        self.job.postproc_result.pp_health = payload
        page_result = self.job.page_result
        page_result.corr_ocr_text_path = self.job.alto_txt_file
        page_result.corr_ocr_xml_path = self.job.alto_xml_file
        return self.results(stdout=None, stderr=None, exitcode=0)
Esempio n. 10
0
    def run(self, postproc):
        """Compare OCR text against the page's ground truth with JuxtaCompare.

        The jar at ``self.executable`` is run through ``java`` with
        ``-diff <ground truth> <input>``, ``-algorithm self.jx_algorithm``
        and ``-hyphen none``.  Its stripped stdout is parsed as a float,
        except that the literal ``'NaN'`` is recorded as the string ``'-1'``.

        Args:
            postproc (bool): When true, ``self.job.alto_txt_file`` is compared
                and the score is stored in
                ``self.job.page_result.juxta_change_index``.  Otherwise
                ``self.job.idhmc_txt_file`` is compared and the score is
                computed but not stored.

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success or when the page has no ground truth, 1 when the input
            file is missing or the output cannot be parsed, and the command's
            own exit code when the command fails.
        """
        # Nothing to compare against: treated as a successful no-op.
        if not self.job.page.hasGroundTruth():
            return self.results(stdout=None, stderr=None, exitcode=0)

        input_file = self.job.alto_txt_file if postproc else self.job.idhmc_txt_file

        if not input_file or not os.path.isfile(input_file):
            stderr = "Could not find JuxtaCompare input file: %s" % input_file
            return self.results(stdout=None, stderr=stderr, exitcode=1)

        cmd = [
            "java", "-Xms128M", "-Xmx128M", "-jar", self.executable,
            "-diff", self.job.page.ground_truth_file, input_file,
            "-algorithm", self.jx_algorithm, "-hyphen", "none",
        ]

        proc = exec_cmd(cmd)
        if proc.exitcode != 0:
            # TODO: juxta-cl.jar errors are going to stdout not stderr
            # stderr is reported only when stdout is empty and stderr is not.
            if not proc.stdout and proc.stderr:
                stderr = proc.stderr
            else:
                stderr = proc.stdout
            return self.results(stdout=proc.stdout, stderr=stderr, exitcode=proc.exitcode)

        out = proc.stdout.strip()

        # Handle invalid values returned by Juxta
        if out == 'NaN':
            value = '-1'
        else:
            # A zero exit code does not guarantee numeric output; report a
            # failed result instead of letting ValueError escape to the caller.
            try:
                value = float(out)
            except ValueError:
                stderr = "JuxtaCompare Error: unexpected response format: %s" % out
                return self.results(stdout=None, stderr=stderr, exitcode=1)

        # Only the post-processing comparison is persisted.
        if postproc:
            self.job.page_result.juxta_change_index = value

        return self.results(stdout=None, stderr=None, exitcode=0)
    def run(self):
        """Run the MultiColumnSkew script on the job's IDHMC XML file.

        The script at ``self.executable`` is run with ``python``.  Its
        stdout must be valid JSON; the ``"multicol"`` and ``"skew_idx"``
        entries are stored on ``self.job.postproc_result`` (``None`` when a
        key is absent).

        Returns:
            The object built by ``self.results``.  Its exitcode is 0 on
            success, 1 when the XML file is missing or the output is not
            valid JSON, and the command's own exit code when the command
            fails.
        """
        xml_path = self.job.idhmc_xml_file
        if not (xml_path and os.path.isfile(xml_path)):
            return self.results(
                stdout=None,
                stderr="Could not find XML file: %s" % xml_path,
                exitcode=1,
            )

        proc = exec_cmd(["python", self.executable, xml_path])
        if proc.exitcode != 0:
            return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

        payload = proc.stdout.strip()

        # Check that output is valid JSON
        try:
            parsed = json.loads(payload)
        except ValueError:
            return self.results(
                stdout=None,
                stderr="MultiColumnSkew Error: output is not valid JSON: %s" % payload,
                exitcode=1,
            )

        postproc_result = self.job.postproc_result
        postproc_result.multicol = parsed.get("multicol")
        postproc_result.skew_idx = parsed.get("skew_idx")

        return self.results(stdout=None, stderr=None, exitcode=0)