def run(self):
    """Run the ``tesseract`` command on this job's page image.

    Returns:
        The value of ``self.results(...)``:

        * exit code 1 with an explanatory ``stderr`` message when the job has
          no image path or the path is not an existing file;
        * the command's own stdout, stderr and exit code when it exits
          non-zero;
        * exit code 0 otherwise, after the text and XML output paths have
          been stored on ``self.job.page_result``.
    """
    job = self.job

    # Validate the input image first; both failures share one exit.
    problem = None
    if not job.image_path:
        problem = "No image path could be determined"
    elif not os.path.isfile(job.image_path):
        problem = "Could not find page image %s" % job.image_path
    if problem is not None:
        return self.results(stdout=None, stderr=problem, exitcode=1)

    # Make sure the directory that receives the output exists.
    if not os.path.isdir(self.output_parent_dir):
        mkdirs_exists_ok(self.output_parent_dir)

    proc = exec_cmd([
        "tesseract",
        job.image_path,
        self.output_filename,
        "-l",
        job.font.name,
        self.cfg,
    ])
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout,
            stderr=proc.stderr,
            exitcode=proc.exitcode,
        )

    # Move the hOCR output to the XML path, but never replace an XML file
    # that is already there.
    hocr_present = os.path.isfile(job.hocr_file)
    if hocr_present and not os.path.isfile(job.xml_file):
        logger.debug("Renaming %s to %s" % (job.hocr_file, job.xml_file))
        os.rename(job.hocr_file, job.xml_file)

    page_result = job.page_result
    page_result.ocr_text_path = job.txt_file
    page_result.ocr_xml_path = job.xml_file
    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self):
    """Run Ocular's ``Transcribe`` main class and record per-page outputs.

    The input document list is generated first, then the input font path is
    validated, the output directory is created if needed, and the java
    command is executed.

    Returns:
        The value of ``self.results(...)``:

        * exit code 1 with an explanatory ``stderr`` message when
          ``self.input_font_path`` is unset or is not an existing file;
        * the command's own stdout, stderr and exit code when it exits
          non-zero;
        * exit code 0 otherwise, after the transcription outputs that exist
          on disk have been stored on each page job's ``page_result``.
    """
    self.generate_input_doc_list()

    if not self.input_font_path:
        stderr = "No input font path could be determined"
        return self.results(stdout=None, stderr=stderr, exitcode=1)

    if not os.path.isfile(self.input_font_path):
        stderr = "Could not find input font path %s" % self.input_font_path
        return self.results(stdout=None, stderr=stderr, exitcode=1)

    # Create output parent directory if it doesn't exist
    if not os.path.isdir(self.output_path):
        mkdirs_exists_ok(self.output_path)

    cmd = [
        "java",
        self.java_max_heap,
        "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.Transcribe",
        "-jar", self.jar,
        "-outputPath", self.output_path,
        "-inputDocListPath", self.input_doc_list_path,
        "-inputFontPath", self.input_font_path,
        "-inputLmPath", self.input_lm_path,
        "-inputGsmPath", self.input_gsm_path,
        "-allowGlyphSubstitution", "true",
        "-skipAlreadyTranscribedDocs", 'true',
        "-emissionEngine", self.job.settings.ocular_emission_engine,
    ]
    if self.extra_command_parameters:
        cmd = cmd + self.extra_command_parameters

    proc = exec_cmd(cmd, realtime=True)
    if proc.exitcode != 0:
        return self.results(stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode)

    # Loop over each of this job's pages and build transcribed output paths.
    # A path is only recorded as a result if the file is found.
    for j in self.job.jobs:
        if not j.image_path:
            # No output file name can be derived without an image path.
            # Skip the page instead of letting os.path.basename() raise and
            # abort result recording for every remaining page.
            continue

        _image_basename = os.path.basename(j.image_path)
        _image_name = os.path.splitext(_image_basename)[0]
        _txt_name = "%s%s" % (_image_name, self.ocr_text_suffix)
        _alto_name = "%s.alto.xml" % _image_name
        _txt_path = os.path.join(self.transcribed_output_path, _txt_name)
        _alto_path = os.path.join(self.transcribed_output_path, _alto_name)

        if os.path.isfile(_txt_path):
            j.page_result.ocr_text_path = _txt_path
        if os.path.isfile(_alto_path):
            j.page_result.ocr_xml_path = _alto_path

    # Add extra transfers
    if os.path.isdir(self.transcription_dir):
        self.job.extra_transfers.append(self.transcription_dir)

    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self):
    """Run Ocular's ``TrainFont`` main class and record the trained outputs.

    The input document list is generated first, then the input font path is
    validated, the output directory is created if needed, and the java
    command is executed.

    Returns:
        The value of ``self.results(...)``:

        * exit code 1 with an explanatory ``stderr`` message when
          ``self.input_font_path`` is unset or is not an existing file;
        * the command's own stdout, stderr and exit code when it exits
          non-zero;
        * exit code 0 otherwise, after each output file that exists on disk
          has been stored on ``self.job.font_training_result``.
    """
    self.generate_input_doc_list()

    font_path = self.input_font_path
    if not font_path:
        return self.results(
            stdout=None,
            stderr="No input font path could be determined",
            exitcode=1,
        )
    if not os.path.isfile(font_path):
        return self.results(
            stdout=None,
            stderr="Could not find input font path %s" % font_path,
            exitcode=1,
        )

    # The output directory must exist before the command runs.
    if not os.path.isdir(self.output_path):
        mkdirs_exists_ok(self.output_path)

    # Fixed java invocation, followed by (flag, value) pairs appended in order.
    cmd = [
        "java",
        self.java_max_heap,
        "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.TrainFont",
        "-jar",
        self.jar,
    ]
    flag_values = (
        ("-outputPath", self.output_path),
        ("-inputDocListPath", self.input_doc_list_path),
        ("-inputFontPath", font_path),
        ("-inputLmPath", self.input_lm_path),
        ("-inputGsmPath", self.input_gsm_path),
        ("-outputFontPath", self.output_font_path),
        ("-outputLmPath", self.output_lm_path),
        ("-outputGsmPath", self.output_gsm_path),
        ("-continueFromLastCompleteIteration", "true"),
        ("-allowGlyphSubstitution", "true"),
        ("-updateLM", "true"),
        ("-updateGsm", "true"),
        ("-emissionEngine", self.job.settings.ocular_emission_engine),
    )
    for flag, value in flag_values:
        cmd.append(flag)
        cmd.append(value)

    if self.extra_command_parameters:
        cmd = cmd + self.extra_command_parameters

    proc = exec_cmd(cmd, realtime=True)
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout,
            stderr=proc.stderr,
            exitcode=proc.exitcode,
        )

    # Record each trained artifact on the job's font_training_result, but
    # only when the corresponding file was actually produced.
    trained_outputs = (
        ("font_path", self.output_font_path),
        ("language_model_path", self.output_lm_path),
        ("glyph_substitution_model_path", self.output_gsm_path),
    )
    for attr_name, produced_path in trained_outputs:
        if os.path.isfile(produced_path):
            setattr(self.job.font_training_result, attr_name, produced_path)

    return self.results(stdout=None, stderr=None, exitcode=0)
def save(self, data, dirname, filename, overwrite=False):
    """Write ``data`` as JSON to ``filename``.

    Args:
        data: Object to serialize with ``json``.
        dirname: Directory that is created with ``mkdirs_exists_ok`` when it
            is not already a directory.
            NOTE(review): ``filename`` is opened exactly as given and is not
            joined with ``dirname`` -- presumably callers pass a full path
            inside ``dirname``; confirm against callers.
        filename: Path of the file to write.
        overwrite: When false (the default) an existing ``filename`` is left
            untouched and an error is logged.

    Returns:
        ``True`` once the file has been written, ``None`` when ``filename``
        already exists and ``overwrite`` is false.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when ``data``
            cannot be serialized. The target file is not opened in that
            case, so an existing payload is left intact and no partial file
            is created.
    """
    if not os.path.isdir(dirname):
        logger.debug("Creating payload directory %s", dirname)
        mkdirs_exists_ok(dirname)

    if not overwrite and os.path.exists(filename):
        logger.error("payload file %s already exists", filename)
        return None

    # Serialize before opening the file. Opening with 'w' truncates
    # immediately, so streaming json.dump() into it would destroy an existing
    # payload (or leave a partial file that blocks later non-overwrite saves)
    # whenever the data turns out not to be serializable.
    payload = json.dumps(data)

    if overwrite:
        logger.debug("Overwriting payload file at %s", filename)
    else:
        logger.debug("Saving payload to %s", filename)

    with open(filename, 'w') as outfile:
        outfile.write(payload)
    return True