def work(self) -> bool:
    """Validate the md5 checksums listed in the package's checksum.md5.

    Returns:
        True when validation ran to completion (errors, if any, are
        stored via set_results); False when permission problems
        prevented reading the package.
    """
    collected: typing.List[hathi_result.Result] = []
    checksum_report = os.path.join(self.package_path, "checksum.md5")
    task_logger = logging.getLogger(hathi_validate.__name__)
    task_logger.setLevel(logging.INFO)
    with self.log_config(task_logger):
        report_builder = hathi_result.SummaryDirector(
            source=checksum_report)
        try:
            # Only the file names are needed here; the hashes are
            # re-read by the validator itself.
            files_to_check = [
                entry for _, entry in
                validate_process.extracts_checksums(checksum_report)
            ]
            self.log(
                "Validating checksums of the {} files "
                "included in {}".format(
                    len(files_to_check), checksum_report))
            checksum_report_errors = validate_process.run_validation(
                validator.ValidateChecksumReport(
                    self.package_path, checksum_report))
            if checksum_report_errors:
                collected.extend(checksum_report_errors)
            else:
                self.log(
                    "All checksums in {} successfully validated".format(
                        checksum_report))
        except FileNotFoundError as e:
            report_builder.add_error(
                "Unable to validate checksums. Reason: {}".format(e))
        except PermissionError as e:
            # Replace the builder: a permission failure is reported
            # against the package directory rather than the report file.
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error("Permission issues. \"{}\"".format(e))
            self.set_results(report_builder.construct())
            return False
        collected.extend(report_builder.construct())
        self.set_results(collected)
        return True
def work(self) -> bool:
    """Validate checksums from checksum.md5 against the package files.

    Returns:
        True on completion; False when permission problems prevented
        reading the package.
    """
    errors: List[hathi_result.Result] = []
    checksum_report = os.path.join(self.package_path, "checksum.md5")
    package_logger = logging.getLogger(hathi_validate.__name__)
    package_logger.setLevel(logging.INFO)
    with self.log_config(package_logger):
        report_builder = hathi_result.SummaryDirector(
            source=checksum_report)
        try:
            files_to_check = []
            for _, file_name in validate_process.extracts_checksums(
                    checksum_report):
                files_to_check.append(file_name)
            self.log(
                f"Validating checksums of the {len(files_to_check)} files "
                f"included in {checksum_report}")
            checksum_report_errors: List[hathi_result.Result] = \
                validate_process.run_validation(
                    validator.ValidateChecksumReport(
                        self.package_path, checksum_report))
            if checksum_report_errors:
                errors.extend(checksum_report_errors)
            else:
                self.log(
                    f"All checksums in {checksum_report} successfully "
                    f"validated")
        except FileNotFoundError as file_missing_error:
            report_builder.add_error(
                "Unable to validate checksums. "
                f"Reason: {file_missing_error}")
        except PermissionError as permission_error:
            # Permission failures are reported against the package
            # directory and abort the task.
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error(
                f'Permission issues. "{permission_error}"')
            self.set_results(report_builder.construct())
            return False
        errors.extend(report_builder.construct())
        self.set_results(errors)
        return True
def work(self) -> bool:
    """Validate the package's marc.xml file (skipped when absent).

    Returns:
        True on completion; False when permission problems prevented
        reading the package.
    """
    marc_file = os.path.join(self.package_path, "marc.xml")
    result_builder = hathi_result.SummaryDirector(source=marc_file)
    errors: typing.List[hathi_result.Result] = []
    marc_logger = logging.getLogger(hathi_validate.__name__)
    marc_logger.setLevel(logging.INFO)
    with self.log_config(marc_logger):
        try:
            if os.path.exists(marc_file):
                self.log(
                    "Validating marc.xml in {}".format(self.package_path))
                marc_errors = validate_process.run_validation(
                    validator.ValidateMarc(marc_file))
                if marc_errors:
                    for issue in marc_errors:
                        self.log(issue.message)
                        errors.append(issue)
                else:
                    self.log(
                        "{} successfully validated".format(marc_file))
            else:
                # A missing marc.xml is not an error for this task.
                self.log(
                    "Skipping '{}' due to file not found".format(marc_file))
        except FileNotFoundError as e:
            result_builder.add_error(
                "Unable to Validate Marc. Reason: {}".format(e))
        except PermissionError as e:
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error("Permission issues. \"{}\"".format(e))
            self.set_results(report_builder.construct())
            return False
        errors.extend(result_builder.construct())
        self.set_results(errors)
        return True
def work(self) -> bool:
    """Check that every page has the expected component files.

    Expects files named with an 8-digit scheme, with .txt and .jp2
    components (plus .xml when OCR checking is enabled).

    Returns:
        True on completion; False when no matching files were found or
        permission problems prevented reading the package.
    """
    errors: typing.List[hathi_result.Result] = []
    extensions = [".txt", ".jp2"]
    component_logger = logging.getLogger(hathi_validate.__name__)
    component_logger.setLevel(logging.INFO)
    with self.log_config(component_logger):
        if self.check_ocr:
            extensions.append(".xml")
        try:
            missing_files_errors = validate_process.run_validation(
                validator.ValidateComponents(
                    self.package_path, "^[0-9]{8}$", *extensions))
        except FileNotFoundError:
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error(
                "No files located with expected file naming scheme in path")
            self.set_results(report_builder.construct())
            return False
        except PermissionError as access_error:
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error(
                "Permission issues. \"{}\"".format(access_error))
            self.set_results(report_builder.construct())
            return False
        if missing_files_errors:
            for issue in missing_files_errors:
                self.log(issue.message)
                errors.append(issue)
        else:
            self.log(
                "Found no missing component files in {}".format(
                    self.package_path))
        self.set_results(errors)
        return True
def work(self) -> bool:
    """Validate the OCR xml files in the package.

    Returns:
        True on completion; False when permission problems prevented
        reading the package.

    Raises:
        Exception: any unexpected error from the validator is printed
            and re-raised.
    """
    errors: typing.List[hathi_result.Result] = []
    my_logger = logging.getLogger(hathi_validate.__name__)
    my_logger.setLevel(logging.INFO)
    with self.log_config(my_logger):
        print("Running ocr Validation")
        try:
            ocr_errors = validate_process.run_validation(
                validator.ValidateOCRFiles(path=self.package_path))
        except PermissionError as e:
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error("Permission issues. \"{}\"".format(e))
            self.set_results(report_builder.construct())
            return False
        except Exception as e:
            # Surface unexpected failures on the console before
            # re-raising so the cause is visible even if the caller
            # swallows the traceback.
            print(e)
            raise
        if not ocr_errors:
            # Fixed: the condition was previously inverted (the success
            # message fired when errors WERE found) and the format
            # string was missing its {} placeholder, dropping the path.
            self.log("No validation errors found in {}".format(
                self.package_path))
        else:
            for error in ocr_errors:
                self.log(error.message)
                errors.append(error)
        self.set_results(errors)
        return True
def work(self) -> bool:
    """Validate the package's meta.yml file (skipped when absent).

    Returns:
        True on completion; False when permission problems prevented
        reading the package.
    """
    yml_file = os.path.join(self.package_path, "meta.yml")
    errors: List[hathi_result.Result] = []
    my_logger = logging.getLogger(hathi_validate.__name__)
    my_logger.setLevel(logging.INFO)
    with self.log_config(my_logger):
        report_builder = hathi_result.SummaryDirector(source=yml_file)
        try:
            if not os.path.exists(yml_file):
                # A missing meta.yml is not an error for this task.
                self.log(f"Skipping '{yml_file}' due to file not found")
            else:
                self.log(f"Validating meta.yml in {self.package_path}")
                meta_yml_errors = validate_process.run_validation(
                    validator.ValidateMetaYML(yaml_file=yml_file,
                                              path=self.package_path,
                                              required_page_data=True))
                if not meta_yml_errors:
                    self.log(f"{yml_file} successfully validated")
                else:
                    for error in meta_yml_errors:
                        self.log(error.message)
                        errors.append(error)
        except FileNotFoundError as file_not_found_error:
            report_builder.add_error(
                f"Unable to validate YAML. Reason: {file_not_found_error}")
        except PermissionError as permission_error:
            # Added for consistency with the other validation tasks:
            # report permission problems as a summary and signal failure
            # instead of letting the exception propagate.
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error(
                f'Permission issues. "{permission_error}"')
            self.set_results(report_builder.construct())
            return False
        for error in report_builder.construct():
            errors.append(error)
        self.set_results(errors)
        return True
def find_errors_marc(filename) -> result.ResultSummary:
    """Validate a MARC XML file against the MARC XSD scheme.

    Args:
        filename: Path to the marc.xml file to check.

    Returns:
        A ResultSummary containing an error for any validation failure,
        missing file, or XML syntax problem.
    """
    summary_builder = result.SummaryDirector(source=filename)
    xsd = etree.XML(xml_schemes.MARC_XSD)  # type: ignore
    scheme = etree.XMLSchema(xsd)
    try:
        with open(filename, "r", encoding="utf8") as f:
            raw_data = f.read()
        # Encode back to bytes before parsing: lxml raises ValueError
        # for str input carrying an XML encoding declaration (which
        # MARC records typically include). This matches the approach
        # already used by find_errors_ocr.
        doc = etree.fromstring(raw_data.encode("utf8"))
        if not scheme.validate(doc):  # type: ignore
            summary_builder.add_error("Unable to validate")
    except FileNotFoundError:
        summary_builder.add_error("File missing")
    except etree.XMLSyntaxError as e:
        summary_builder.add_error("Syntax error: {}".format(e))
    return summary_builder.construct()
def work(self) -> bool:
    """Check the package for unexpected subdirectories.

    Returns:
        True on completion; False when permission problems prevented
        reading the package.
    """
    errors: typing.List[hathi_result.Result] = []
    subdir_logger = logging.getLogger(hathi_validate.__name__)
    subdir_logger.setLevel(logging.INFO)
    with self.log_config(subdir_logger):
        try:
            extra_subdirectories_errors = validate_process.run_validation(
                validator.ValidateExtraSubdirectories(
                    path=self.package_path))
        except PermissionError as e:
            report_builder = hathi_result.SummaryDirector(
                source=self.package_path)
            report_builder.add_error("Permission issues. \"{}\"".format(e))
            self.set_results(report_builder.construct())
            return False
        if extra_subdirectories_errors:
            for issue in extra_subdirectories_errors:
                self.log(issue.message)
                errors.append(issue)
        else:
            self.log("No extra subdirectories found in {}".format(
                self.package_path))
        self.set_results(errors)
        return True
def step_impl(context):
    """Attach an empty summary (no errors) to the behave context.

    Args:
        context (behave.runner.Context):
    """
    builder = result.SummaryDirector(source="spam_source")
    context.summary = builder.construct()
def find_errors_meta(filename, path, require_page_data=True):
    """Validate a meta.yml file.

    Checks the capture_date, capture_agent, and (optionally) pagedata
    entries for type and existence problems.

    Args:
        filename: Path to the meta.yml file.
        path: Directory holding the page images named in pagedata.
        require_page_data: When True, also verify the pagedata section.

    Returns:
        A ResultSummary of every problem found.
    """

    def _pagedata_errors(metadata):
        # Each pagedata key must name a file that exists under *path*.
        for image_name, _attributes in metadata["pagedata"].items():
            if not os.path.exists(os.path.join(path, image_name)):
                yield ("The pagedata {} contains an nonexistent "
                       "file {}".format(filename, image_name))

    def _capture_date_errors(metadata):
        capture_date = metadata["capture_date"]
        if isinstance(capture_date, datetime.datetime):
            return
        if isinstance(capture_date, str):
            # A string the YAML parser left unconverted may still be a
            # valid date if it matches the expected pattern.
            if DATE_REGEX.fullmatch(capture_date) is None:
                yield "Invalid YAML capture_date {}".format(capture_date)
        else:
            yield "Invalid YAML data type for in capture_date"

    def _capture_agent_errors(metadata):
        capture_agent = metadata["capture_agent"]
        if not isinstance(capture_agent, str):
            yield "Invalid YAML capture_agent: {}".format(capture_agent)

    summary_builder = result.SummaryDirector(source=filename)
    try:
        yml_metadata = parse_yaml(filename=filename)
        try:
            checks = [_capture_date_errors, _capture_agent_errors]
            if require_page_data:
                checks.append(_pagedata_errors)
            for check in checks:
                for message in check(yml_metadata):
                    summary_builder.add_error(message)
        except KeyError as e:
            summary_builder.add_error(
                "{} is missing key, {}".format(filename, e))
    except yaml.YAMLError as e:
        summary_builder.add_error(
            "Unable to read {}. Reason:{}".format(filename, e))
    except FileNotFoundError as e:
        summary_builder.add_error("Missing {}".format(e))
    return summary_builder.construct()
def step_impl(context):
    """Attach a summary with four missing-xml errors to the context.

    Args:
        context (behave.runner.Context):
    """
    builder = result.SummaryDirector(source="spam_source")
    for page in range(1, 5):
        builder.add_error("Missing {:04d}.xml".format(page))
    context.summary = builder.construct()
def find_non_utf8_characters(file_path: str) -> result.ResultSummary:
    """Scan a file line by line for bytes that are not valid UTF-8.

    Args:
        file_path: Path to the file to scan.

    Returns:
        A ResultSummary with one error per offending line.
    """
    result_builder = result.SummaryDirector(source=file_path)
    with open(file_path, "rb") as source:
        for index, raw_line in enumerate(source, start=1):
            try:
                raw_line.decode("utf-8", errors="strict")
            except UnicodeDecodeError as decode_error:
                result_builder.add_error(
                    "Line {} contains illegal characters. "
                    "Details: {}".format(index, decode_error))
    return result_builder.construct()
def find_errors_ocr(path) -> result.ResultSummary:
    """Validate all xml files in *path* against the ALTO scheme.

    The bibliographic record marc.xml is excluded from the check.

    Args:
        path: Path to find the alto xml files

    Returns:
        A ResultSummary of every file that failed to validate or parse.
    """

    def _is_ocr_file(entry):
        # Only plain files named *.xml, excluding marc.xml.
        if not entry.is_file():
            return False
        stem, extension = os.path.splitext(entry.name)
        return extension.lower() == ".xml" and stem.lower() != "marc"

    logger = logging.getLogger(__name__)
    alto_xsd = etree.XML(xml_schemes.get_scheme("alto"))
    alto_scheme = etree.XMLSchema(alto_xsd)
    summary_builder = result.SummaryDirector(source=path)
    for xml_file in (entry for entry in os.scandir(path)
                     if _is_ocr_file(entry)):
        try:
            with open(xml_file.path, "r", encoding="utf8") as f:
                raw_data = f.read()
            doc = etree.fromstring(raw_data.encode("utf8"))
            if alto_scheme.validate(doc):
                logger.info("{} validates to the ALTO XML scheme".format(
                    xml_file.name))
            else:
                summary_builder.add_error(
                    "{} does not validate to ALTO scheme".format(
                        xml_file.name))
        except FileNotFoundError:
            summary_builder.add_error("File missing")
        except etree.XMLSyntaxError as e:
            summary_builder.add_error("Syntax error: {}".format(e))
    return summary_builder.construct()
def find_extra_subdirectory(path) -> result.ResultSummary:
    """Report every subdirectory found directly under *path*.

    Args:
        path: Directory to inspect.

    Returns:
        A ResultSummary with one error per subdirectory found.
    """
    summary_builder = result.SummaryDirector(source=path)
    for entry in filter(lambda item: item.is_dir(), os.scandir(path)):
        summary_builder.add_error(
            "Extra subdirectory {}".format(entry.name))
    return summary_builder.construct()
def find_failing_checksums(path, report) -> result.ResultSummary:
    """Compare each md5 hash listed in *report* with the actual files.

    Args:
        path: Directory containing the files named in the report.
        report: Path to the checksum report file.

    Returns:
        Error report
    """
    logger = logging.getLogger(__name__)
    report_builder = result.SummaryDirector(source=path)
    report_name = os.path.basename(report)
    try:
        for expected_hash, filename in extracts_checksums(report):
            logger.debug(
                "Calculating the md5 checksum hash for {}".format(
                    filename))
            file_path = os.path.join(path, filename)
            try:
                actual_hash = calculate_md5(filename=file_path)
            except FileNotFoundError:
                missing_message = \
                    "Unable to run checksum for missing file, {}".format(
                        filename)
                logger.info(missing_message)
                report_builder.add_error(missing_message)
                continue
            if is_same_hash(actual_hash, expected_hash):
                logger.info(
                    "{} successfully matches md5 hash in {}".format(
                        filename, report_name))
            else:
                logger.debug(
                    'Hash mismatch for "{}". (Actual ({}): expected '
                    '({}))'.format(file_path, actual_hash, expected_hash))
                report_builder.add_error(
                    "Checksum listed in \"{}\" doesn't match for "
                    "\"{}\"".format(report_name, filename))
    except FileNotFoundError:
        # The report file itself could not be read.
        report_builder.add_error("File missing")
    return report_builder.construct()
def find_missing_files(path: str) -> result.ResultSummary:
    """Report any of the required package files missing from *path*.

    The package is expected to contain checksum.md5, marc.xml, and
    meta.yml.

    Args:
        path: Package directory to inspect.

    Returns:
        A ResultSummary with one error per missing file.
    """
    summary_builder = result.SummaryDirector(source=path)
    for required in ("checksum.md5", "marc.xml", "meta.yml"):
        if not os.path.exists(os.path.join(path, required)):
            summary_builder.add_error(
                "Missing file: {}".format(required))
    return summary_builder.construct()
def multiple_summary():
    """Return a summary fixture populated with two distinct errors."""
    builder = result.SummaryDirector(source="eggs_source")
    for message in ("Some Error", "Another Error"):
        builder.add_error(message)
    return builder.construct()
def simple_summary():
    """Return a summary fixture containing a single error."""
    builder = result.SummaryDirector(source="spam_source")
    builder.add_error("Not valid")
    return builder.construct()