def find_subdirectory_errors(path):
    """
    Report sub-folders that exist on only one side of a package.

    The entries of ``<path>/preservation`` and ``<path>/access`` are compared
    by name, and one error is yielded for every name that has no counterpart
    in the other folder.

    Args:
        path: Folder that contains the "preservation" and "access" folders

    Yields:
        error_message.ValidationError for each unmatched entry: first the
        entries found only in the preservation folder, then the entries found
        only in the access folder. Each group is reported in sorted order.

    """
    preservation_folder = os.path.join(path, "preservation")
    access_folder = os.path.join(path, "access")

    preservation_folders = set(os.listdir(preservation_folder))
    access_folders = set(os.listdir(access_folder))

    # A set difference is used instead of removing names one at a time:
    # set.remove() raises KeyError for a name that is only on one side, which
    # is precisely the situation this function has to report.

    # entries that only exist in the preservation folder
    for item_left in sorted(preservation_folders - access_folders):
        new_error = error_message.ValidationError(
            "missing matching {} in {}".format(item_left, preservation_folder),
            group=path)
        new_error.source = path
        yield new_error

    # entries that only exist in the access folder
    for item_left in sorted(access_folders - preservation_folders):
        new_error = error_message.ValidationError(
            "missing matching {} in {}".format(item_left, access_folder),
            group=path)
        new_error.source = path
        yield new_error
def check(self, path):
    """
    Validate the extension and the name of a single preservation file.

    Args:
        path: Path to the file to check

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    errors = []

    def _report(message, group):
        # Every problem is recorded the same way: the error is tagged with
        # the checked file as its source and collected for the result.
        problem = error_message.ValidationError(message, group=group)
        problem.source = path
        errors.append(problem)

    parent_dir, file_name = os.path.split(path)
    stem, suffix = os.path.splitext(file_name)

    # Files with an ignored extension are skipped entirely
    if suffix not in self.ignore_extension:
        if suffix not in self.valid_extensions:
            _report(
                "Invalid preservation file extension: \"{}\"".format(suffix),
                path)

        # Only tif images that are not target images have to follow the
        # naming scheme
        is_regular_image = suffix == ".tif" and "target" not in stem
        if is_regular_image and \
                PresNamingChecker.valid_naming_scheme.match(stem) is None:
            _report(
                "Does not match the valid preservation file naming pattern",
                parent_dir.split(os.sep)[-1])

    # The file is valid exactly when no error has been recorded
    return checkers.Results(self.checker_name(), valid=not errors, errors=errors)
def find_root_directory_errors(path: str):
    """
    Check that a package root holds the required folders and nothing else.

    Args:
        path: Root folder of the package

    Yields:
        error_message.ValidationError for every unexpected folder or file in
        the root, followed by one error per required folder that was not
        found.

    """

    def _make_error(message):
        # All errors of this check share the root as both group and source
        problem = error_message.ValidationError(message, group=path)
        problem.source = path
        return problem

    still_missing = {"access", "preservation"}

    for entry in os.scandir(path):
        if entry.is_dir():
            if entry.name in still_missing:
                # a required folder has been located
                still_missing.remove(entry.name)
                continue
            yield _make_error("{} is an invalid folder.".format(entry.path))
        elif entry.is_file():
            yield _make_error("{} is an invalid file.".format(entry.path))

    # Whatever was not located during the scan is missing from the root
    for folder in still_missing:
        yield _make_error(
            "{} is missing required folder {}".format(path, folder))
def check(self, path):
    """
    Check that the access and preservation folders contain matching files.

    Args:
        path: Package to check. Despite its name this is not a plain path:
            the method reads ``path.directories["access"]``,
            ``path.directories["preservation"]`` and ``path.identifier``.

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    # NOTE: this uses the package because of the way hathi packages are formatted
    valid = True
    errors = []
    access_folder = path.directories["access"]
    preservation_folder = path.directories["preservation"]

    # Check if everything in access folder is found same in the preservation folder
    missing_pres_files = self.check_for_missing_matching_preservation(
        access_folder=access_folder,
        preservation_folder=preservation_folder)
    if missing_pres_files:
        valid = False
        new_error = error_message.ValidationError(
            "The files [{}] were found in the access but not in the preservation folder".format(
                ", ".join(os.path.basename(f) for f in missing_pres_files)),
            group=path.identifier)
        new_error.source = access_folder
        errors.append(new_error)

    # Check if everything in preservation folder is found same in the access folder
    missing_access_files = self.check_for_missing_matching_access(
        access_folder=access_folder,
        preservation_folder=preservation_folder)
    if missing_access_files:
        # This branch used to leave the result flagged as valid although an
        # error was attached to it.
        valid = False
        new_error = error_message.ValidationError(
            "The files [{}] were found in the preservation folder but not in the access folder".format(
                ", ".join(os.path.basename(f) for f in missing_access_files)),
            group=path.identifier)
        new_error.source = preservation_folder
        errors.append(new_error)

    return checkers.Results(self.checker_name(), valid=valid, errors=errors)
def check(self, path: str):
    """
    Make sure that the access folder is complete.

    Two things are verified: that ``find_missing_by_number`` reports no
    missing files, and that the required files checksum.md5, marc.xml and
    meta.yml exist somewhere below the folder.

    Args:
        path: Path to the folder to check

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    required_files = {"checksum.md5", "marc.xml", "meta.yml"}
    error_group = path.split(os.sep)[-1]
    errors = []
    valid = True

    try:
        missing = list(self.find_missing_by_number(path))
        if missing:
            valid = False
            new_error = error_message.ValidationError(
                "Expected files [{}] not found in access folder".format(", ".join(missing)),
                group=error_group)
            new_error.source = path
            errors.append(new_error)
    except ValueError as e:
        valid = False
        new_error = error_message.ValidationError(
            "Error trying to find missing files. Reason: {}".format(e),
            group=error_group)
        new_error.source = path
        errors.append(new_error)
    except FileNotFoundError as e:
        # A folder that cannot be found is reported as a validation error
        # instead of aborting the whole validation run.
        valid = False
        new_error = error_message.ValidationError(str(e), group=error_group)
        new_error.source = path
        errors.append(new_error)

    # Strike every required file that shows up anywhere below the folder
    for _root, _dirs, files in os.walk(path):
        required_files.difference_update(files)

    # Any files still in the required_files set are missing.
    if required_files:
        valid = False
        # sorted() keeps the message stable; the iteration order of a set of
        # strings can differ from one run to the next
        new_error = error_message.ValidationError(
            "Missing expected file(s), [{}]".format(", ".join(sorted(required_files))),
            group=error_group)
        new_error.source = path
        errors.append(new_error)

    return checkers.Results(self.checker_name(), valid=valid, errors=errors)
def check(self, path):
    """
    Make sure that the preservation folder is complete.

    Args:
        path: Path to the folder to check

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    expected_targets = (
        "target_l_001.tif",
        "target_l_002.tif",
        "target_r_001.tif",
        "target_r_002.tif",
    )
    folder_name = path.split(os.sep)[-1]
    errors = []

    def _record(message):
        # All errors of this check are grouped under the folder name and
        # point at the checked folder as their source
        problem = error_message.ValidationError(message, group=folder_name)
        problem.source = path
        errors.append(problem)

    # Files that are missing according to find_missing_by_number
    try:
        absent_numbered = list(self.find_missing_by_number(path))
        if absent_numbered:
            _record(
                "Expected files [{}] not found in preservation folder".format(
                    ", ".join(absent_numbered)))
    except ValueError as e:
        _record("Error trying to find missing files. Reason: {}".format(e))
    except FileNotFoundError as e:
        _record(e)

    # Required target files that are missing
    absent_targets = list(
        self.find_missing_required_files(path=path,
                                         expected_files=expected_targets))
    if absent_targets:
        _record(
            "Missing expected file(s), [{}]".format(", ".join(absent_targets)))

    # The folder is valid exactly when no error has been recorded
    return checkers.Results(self.checker_name(), valid=not errors, errors=errors)
def setup(self):
    """
    Locate the Hathi packages under self.path and queue one validation task
    for each of them.

    A packages.PackageError raised while the packages are collected is not
    propagated: self.valid is set to False and the error is appended to
    self.errors. Packages collected before the failure are still queued.

    """
    self.packages = []
    try:
        package_set = packages.create_package("Hathi", self.path)
        for package in package_set:
            self.packages.append(package)
    except packages.PackageError as e:
        self.valid = False
        new_error = error_message.ValidationError(e, group=self.path)
        new_error.source = self.path
        self.errors.append(new_error)

    # Add the tasks that need to be validated
    for hathi_package in self.packages:
        # The task is labelled with the last path component of the package's
        # preservation directory
        task_name = hathi_package.directories["preservation"].split(os.path.sep)[-1]
        my_task = tasks.Task(description="Validating {} in {}".format(task_name, hathi_package.root))

        # Every processor below follows the same sequence:
        # create, setup(), set_input(), then add it to the task.

        # Package Structure Completeness: gets the package root as input
        package_structure_test = validation_processors.PackageStructureComplete()
        package_structure_test.setup()
        package_structure_test.set_input(hathi_package.root)
        my_task.add_process(package_structure_test)

        # Package component Completeness: gets the package object itself
        package_component_test = validation_processors.PackageComponentComplete()
        package_component_test.setup()
        package_component_test.set_input(hathi_package)
        my_task.add_process(package_component_test)

        # Preservation Folder
        preservation_folder_completeness_test = validation_processors.PackagePreservationComplete()
        preservation_folder_completeness_test.setup()
        preservation_folder_completeness_test.set_input(hathi_package.directories["preservation"])
        my_task.add_process(preservation_folder_completeness_test)

        # Access folder
        access_folder_completeness_test = validation_processors.PackageAccessComplete()
        access_folder_completeness_test.setup()
        access_folder_completeness_test.set_input(hathi_package.directories['access'])
        my_task.add_process(access_folder_completeness_test)

        # Preservation file name: one processor per entry of the preservation directory
        for file in os.scandir(hathi_package.directories["preservation"]):
            preservation_file_naming_test = validation_processors.PreservationFileNaming()
            preservation_file_naming_test.setup()
            preservation_file_naming_test.set_input(file.path)
            my_task.add_process(preservation_file_naming_test)

        # Access file name: one processor per entry of the access directory
        for file in os.scandir(hathi_package.directories["access"]):
            access_file_naming_test = validation_processors.AccessFileNaming()
            access_file_naming_test.setup()
            access_file_naming_test.set_input(file.path)
            my_task.add_process(access_file_naming_test)

        # Hand the fully assembled task for this package to the manager
        self.manager.push(my_task)
def check(self, path):
    """
    Validate the extension and the name of a single file.

    Args:
        path: Path to the file to check

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    errors = []

    def _report(message, group):
        # Every problem is recorded the same way: the error is tagged with
        # the checked file as its source and collected for the result.
        problem = error_message.ValidationError(message, group=group)
        problem.source = path
        errors.append(problem)

    parent_dir, file_name = os.path.split(path)
    stem, suffix = os.path.splitext(file_name)

    # Files with an ignored extension are skipped entirely
    if suffix not in self.ignore_extension:
        if suffix not in self.valid_extensions:
            _report("Invalid file type", path.split(os.sep)[-1])

        # The name without its extension has to follow the naming scheme
        if self.valid_naming_scheme.match(stem) is None:
            _report(
                "Does not match the valid file pattern for preservation files",
                parent_dir.split(os.sep)[-1])

    # NOTE: restricting .xml files to marc.xml and .yml files to meta.yml
    # is not enforced by this check.

    # The file is valid exactly when no error has been recorded
    return checkers.Results(self.checker_name(), valid=not errors, errors=errors)
def check(self, path):
    """
    Validate the name of a single file against the naming scheme.

    Only files whose extension is listed in self.extensions_to_check are
    examined; every other file passes.

    Args:
        path: Path to the file to check

    Returns:
        checkers.Results holding the checker name, the validity flag and the
        errors that were found

    """
    parent_dir, file_name = os.path.split(path)
    folder_name = parent_dir.split(os.sep)[-1]
    stem, suffix = os.path.splitext(file_name)

    errors = []
    name_is_checked = suffix in self.extensions_to_check
    if name_is_checked and self.valid_naming_scheme.match(stem) is None:
        problem = error_message.ValidationError(
            "Does not match the valid file pattern for preservation files",
            group=folder_name)
        problem.source = path
        errors.append(problem)

    # The file is valid exactly when no error has been recorded
    return checkers.Results(self.checker_name(), valid=not errors, errors=errors)
def setup(self): package_searcher = self.profile.get_package_type package_searcher.root_path = self.path try: for package in package_searcher: self.packages.append(package) except packages.PackageError as e: self.valid = False new_error = error_message.ValidationError(e, group=self.path) new_error.source = self.path self.errors.append(new_error) for package in self.packages: self.manager.push( self.profile.create_validate_package_task(package))