def discover_task_metadata(
        self,
        initial_results: Sequence[Any],
        additional_data,
        **user_args: Union[str, bool]
) -> List[Dict[str, Union[str, Collection[str]]]]:
    """Build the job arguments for every accepted folder in the user input.

    Args:
        initial_results: Not read by this method.
        additional_data: Not read by this method.
        **user_args: User supplied options. The entries keyed by
            OPTION_USER_INPUT and IDENTIFIER_TYPE are required; the ones
            keyed by OPTION_955_FIELD and OPTION_035_FIELD default to
            False when absent.

    Returns:
        One dictionary per directory entry accepted by
        ``self.filter_bib_id_folders``.

    Raises:
        MissingConfiguration: If "getmarc_server_url" is not present in
            the global settings.

    """
    api_server = self.global_settings.get("getmarc_server_url")
    if api_server is None:
        raise MissingConfiguration("getmarc_server_url")

    job_args = []
    candidates = os.scandir(user_args[OPTION_USER_INPUT])
    for entry in filter(self.filter_bib_id_folders, candidates):
        # The option lookups stay inside the loop so that a missing
        # identifier type is only reported when a folder is processed.
        directory = {
            "value": entry.name,
            "type": user_args[IDENTIFIER_TYPE],
        }
        enhancements = {
            "955": user_args.get(OPTION_955_FIELD, False),
            "035": user_args.get(OPTION_035_FIELD, False),
        }
        job_args.append(
            {
                "directory": directory,
                "enhancements": enhancements,
                "api_server": api_server,
                "path": entry.path,
            }
        )
    return job_args
def discover_task_metadata(self,
                           initial_results: List[
                               speedwagon.tasks.Result],
                           additional_data: Dict[str, Any],
                           **user_args: str) -> List[dict]:
    """Create one OCR task description per image file found earlier.

    Args:
        initial_results: Results of the initial tasks. The ``data``
            attribute of each result is iterated for image file paths.
        additional_data: Not read by this method.
        **user_args: User supplied options. "Language" is matched against
            the values of ``ocr.LANGUAGE_CODES``.

    Returns:
        A list of dictionaries, one per image file, holding the source
        file, the destination directory, the text file name and the
        language code.

    Raises:
        MissingConfiguration: If a tessdata path is set but does not
            exist on disk.
        ValueError: If there is at least one image file and the requested
            language has no entry in ``ocr.LANGUAGE_CODES``.

    """
    if self.tessdata_path is not None and \
            not os.path.exists(self.tessdata_path):
        raise MissingConfiguration("tessdata_path")

    image_files = [
        image_file
        for result in initial_results
        for image_file in result.data
    ]
    if not image_files:
        # Nothing to do; the language is deliberately not validated here
        # so that empty input keeps returning an empty list.
        return []

    # The language does not change between images, so resolve it once
    # instead of scanning the language table for every file.
    language = user_args["Language"]
    for key, value in ocr.LANGUAGE_CODES.items():
        if value == language:
            language_code = key
            break
    else:
        raise ValueError(
            f"Unable to look up language code for {language}"
        )

    new_tasks = []
    for image_file in image_files:
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        new_tasks.append({
            "source_file_path": image_file,
            "destination_path": os.path.dirname(image_file),
            "output_file_name": f"{base_name}.txt",
            "lang_code": language_code
        })
    return new_tasks
def discover_task_metadata(self,
                           initial_results: List[Any],
                           additional_data,
                           **user_args) -> List[dict]:
    """Create one OCR task description per image file found earlier.

    Args:
        initial_results: Results of the initial tasks. The ``data``
            attribute of each result is iterated for image file paths.
        additional_data: Not read by this method.
        **user_args: User supplied options. "Language" is matched against
            the values of ``ocr.LANGUAGE_CODES``.

    Returns:
        A list of dictionaries, one per image file, holding the source
        file, the destination directory, the text file name and the
        language code.

    Raises:
        MissingConfiguration: If a tessdata path is set but does not
            exist on disk.
        ValueError: If there is at least one image file and the requested
            language has no entry in ``ocr.LANGUAGE_CODES``.

    """
    # os.path.exists() raises TypeError when handed None, so an unset
    # path is skipped rather than crashing with an unrelated error.
    tessdata_path = self.tessdata_path
    if tessdata_path is not None and not os.path.exists(tessdata_path):
        raise MissingConfiguration("tessdata_path")

    image_files = [
        image_file
        for result in initial_results
        for image_file in result.data
    ]
    if not image_files:
        # Nothing to do; the language is deliberately not validated here
        # so that empty input keeps returning an empty list.
        return []

    # The language does not change between images, so resolve it once
    # instead of scanning the language table for every file.
    language = user_args["Language"]
    for code, name in ocr.LANGUAGE_CODES.items():
        if name == language:
            language_code = code
            break
    else:
        raise ValueError("Unable to look up language code "
                         "for {}".format(language))

    new_tasks = []
    for image_file in image_files:
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        new_tasks.append({
            "source_file_path": image_file,
            "destination_path": os.path.dirname(image_file),
            "output_file_name": "{}.txt".format(base_name),
            "lang_code": language_code
        })
    return new_tasks
def __init__(self, *args, **kwargs) -> None:
    """Set up the workflow and build its user facing description.

    Args:
        *args: Forwarded unchanged to the parent initializer.
        **kwargs: Forwarded unchanged to the parent initializer. The
            optional "global_settings" entry is read for the "tessdata"
            path.

    Raises:
        MissingConfiguration: If no "tessdata" value is configured.

    """
    super().__init__(*args, **kwargs)

    # An explicit global_settings=None is treated like an omitted value
    # instead of failing on None.get().
    settings = kwargs.get('global_settings')
    self.global_settings = {} if settings is None else settings

    self.tessdata_path = self.global_settings.get("tessdata")
    if self.tessdata_path is None:
        raise MissingConfiguration(
            "Required setting not configured: tessdata")

    if not os.path.exists(self.tessdata_path):
        # makedirs also creates missing parent directories, and
        # exist_ok tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(self.tessdata_path, exist_ok=True)

    engine_version = ocr.Engine(self.tessdata_path).get_version()
    description = (
        "Create OCR text files for images. \n"
        "\n"
        "Settings: \n"
        " Path: Path containing tiff or jp2 files. \n"
        " Image File Type: The type of Image file to use.\n"
        "\n"
        "\n"
        "Adding Additional Languages:\n"
        " To modify the available languages, place "
        "Tesseract traineddata files for "
        f"version {engine_version} "
        "into the following directory:\n"
        "\n"
        f"{self.tessdata_path}.\n"
        "\n"
        "Note:\n"
        " It's important to use the correct version of the "
        "traineddata files. Using incorrect versions won't crash the "
        "program but they may produce unexpected results.\n"
        "\n"
        "For more information about these files, go to "
        "https://github.com/tesseract-ocr/tesseract/wiki/Data-Files\n"
    )
    self.set_description(description)
def discover_task_metadata(self,
                           initial_results: Sequence[Any],
                           additional_data,
                           **user_args) -> List[Dict[Any, Any]]:
    """Build the job arguments for every accepted folder in the input.

    Args:
        initial_results: Not read by this method.
        additional_data: Not read by this method.
        **user_args: User supplied options. "Input" is the directory that
            gets scanned and "Identifier type" is copied into every job.

    Returns:
        One dictionary per directory entry accepted by
        ``self.filter_bib_id_folders``.

    Raises:
        MissingConfiguration: If "getmarc_server_url" is not present in
            the global settings.

    """
    api_server = self.global_settings.get("getmarc_server_url")
    if api_server is None:
        raise MissingConfiguration("getmarc_server_url")

    entries = os.scandir(user_args["Input"])
    return [
        {
            "identifier": {
                "value": entry.name,
                "type": user_args['Identifier type'],
            },
            "api_server": api_server,
            "path": entry.path,
        }
        for entry in filter(self.filter_bib_id_folders, entries)
    ]
def __init__(
        self,
        global_settings: Optional[Dict[str, str]] = None
) -> None:
    """Create the workflow and check that its required settings exist.

    Args:
        global_settings: Settings that could affect the way the workflow
            runs. When None, the existing ``global_settings`` attribute
            is left as it is.

    Raises:
        MissingConfiguration: If any key listed in
            ``required_settings_keys`` has no value in the settings.

    """
    super().__init__()

    if global_settings is not None:
        self.global_settings = global_settings

    # Validation reads the attribute rather than the argument, so it also
    # covers the case where no settings were passed in.
    required = GenerateMarcXMLFilesWorkflow.required_settings_keys
    for setting_name in required:
        if self.global_settings.get(setting_name) is None:
            raise MissingConfiguration(
                f"Missing value for {setting_name}"
            )