def validate(data, schema):
    """Validate *data* against the JSON (or YAML) schema at path *schema*.

    Arguments:
        data -- a config dict, or an object exposing ``to_dict('records')``
                (presumably a pandas DataFrame -- TODO confirm callers).
        schema -- path to the schema file; loaded via snakemake's
                  ``_load_configfile``.

    Returns:
        list of MarsValidationError, one per schema violation (empty if valid).
    """
    # Local imports: keep heavy/optional deps scoped to this function.
    # `import jsonschema` fixes a latent NameError -- the original referenced
    # `jsonschema.exceptions.ValidationError` while only importing names
    # *from* jsonschema; `re` and `Path` are likewise bound explicitly.
    import re
    import jsonschema
    from pathlib import Path
    from jsonschema import RefResolver, validators, FormatChecker, Draft4Validator
    from urllib.parse import urljoin
    from snakemake.io import _load_configfile

    schemafile = schema
    schema = _load_configfile(schema, filetype="Schema")

    # Resolve "$ref" entries relative to the schema file's own location.
    resolver = RefResolver(
        urljoin('file:', schemafile), schema,
        handlers={
            'file': lambda uri: _load_configfile(re.sub("^file://", "", uri))
        })

    format_checker = FormatChecker()

    def path_exists(validator, properties, instance, schema):
        # Custom "must_exist" keyword: value must name an existing filesystem
        # path (after ~ expansion).
        if properties and not Path(instance).expanduser().exists():
            yield jsonschema.exceptions.ValidationError(
                "{} does not exist".format(instance))

    @format_checker.checks('file')
    def check_filepath(value):
        # Only reject paths that exist but are not regular files; a missing
        # path passes the format check (existence is handled by "must_exist").
        path = Path(value)
        return path.is_file() if path.exists() else True

    @format_checker.checks('directory')
    def check_directory(value):
        # Mirror of check_filepath for directories.
        path = Path(value)
        return path.is_dir() if path.exists() else True

    # Extend Draft 4 with the custom "must_exist" validator.
    all_validators = dict(Draft4Validator.VALIDATORS)
    all_validators['must_exist'] = path_exists
    Validator = validators.create(meta_schema=Draft4Validator.META_SCHEMA,
                                  validators=all_validators)
    validator = Validator(schema, resolver=resolver,
                          format_checker=format_checker)

    errors = []

    def collect_errors(instance):
        # Shared by both branches below (previously duplicated inline).
        for ve in validator.iter_errors(instance):
            key = ve.relative_path.pop() if len(ve.relative_path) > 0 else None
            errors.append(MarsValidationError(ve.instance, key, ve.message))

    if not isinstance(data, dict):
        # Row-wise validation; the schema describes a single record.
        # (Removed a leftover debug `print(row)` here.)
        for row in data.to_dict('records'):
            collect_errors(row)
    else:
        collect_errors(data)
    return errors
def validate(data, schema):
    """Check *data* against the JSON schema stored at *schema*.

    Arguments
    data -- data to validate. Can be a config dict or a pandas data frame.
    schema -- Path to JSON schema used for validation. The schema can also be
        in YAML format. If validating a pandas data frame, the schema has to
        describe a row record (i.e., a dict with column names as keys pointing
        to row values). See http://json-schema.org. The path is interpreted
        relative to the Snakefile when this function is called.

    Raises WorkflowError if validation fails or jsonschema is unavailable.
    """
    try:
        import jsonschema
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    if not os.path.isabs(schema):
        # Resolve the schema path relative to the calling Snakefile.
        frame = inspect.currentframe().f_back
        # if workflow object is not available this has not been started from a workflow
        if "workflow" in frame.f_globals:
            workflow = frame.f_globals["workflow"]
            schema = os.path.join(workflow.current_basedir, schema)
    schema = _load_configfile(schema, filetype="Schema")

    # Simple case first: a plain config dict is validated as a whole.
    if isinstance(data, dict):
        try:
            jsonschema.validate(data, schema)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)
        return

    # Otherwise only pandas DataFrames are supported: each row record is
    # validated independently against the schema.
    try:
        import pandas as pd
    except ImportError:
        raise WorkflowError("Unsupported data type for validation.")
    if not isinstance(data, pd.DataFrame):
        raise WorkflowError("Unsupported data type for validation.")

    for row_idx, raw_record in enumerate(data.to_dict("records")):
        # Drop NaN cells so optional columns don't trip "type" checks.
        record = {
            col: val
            for col, val in raw_record.items() if not pd.isnull(val)
        }
        try:
            jsonschema.validate(record, schema)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError(
                "Error validating row {} of data frame.".format(row_idx), e)
def validate(data, schema, set_default=True):
    """Validate data with JSON schema at given path.

    Args:
        data (object): data to validate. Can be a config dict or a pandas
            data frame.
        schema (str): Path to JSON schema used for validation. The schema
            can also be in YAML format. If validating a pandas data frame,
            the schema has to describe a row record (i.e., a dict with
            column names as keys pointing to row values). See
            https://json-schema.org. The path is interpreted relative to
            the Snakefile when this function is called.
        set_default (bool): set default values defined in schema. See
            https://python-jsonschema.readthedocs.io/en/latest/faq/ for
            more information
    """
    # Look up the caller's globals to find the workflow object (if this
    # was invoked from within a Snakefile).
    frame = inspect.currentframe().f_back
    workflow = frame.f_globals.get("workflow")
    if workflow and workflow.modifier.skip_validation:
        # skip if a corresponding modifier has been defined
        return
    try:
        import jsonschema
        from jsonschema import validators, RefResolver
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    schemafile = infer_source_file(schema)

    if isinstance(schemafile, LocalSourceFile) and not schemafile.isabs() and workflow:
        # if workflow object is not available this has not been started from a workflow
        schemafile = workflow.current_basedir.join(schemafile)

    # Prefer the workflow's source cache when available; otherwise read the
    # schema path/URI directly.
    source = (workflow.sourcecache.open(schemafile)
              if workflow else schemafile.get_path_or_uri())
    schema = _load_configfile(source, filetype="Schema")

    if isinstance(schemafile, LocalSourceFile):
        # Local schema: resolve "$ref" entries relative to the schema file,
        # stripping the file:// prefix before loading.
        resolver = RefResolver(
            urljoin("file:", schemafile.get_path_or_uri()),
            schema,
            handlers={
                "file": lambda uri: _load_configfile(re.sub("^file://", "", uri))
            },
        )
    else:
        # Remote schema: let jsonschema's default handlers resolve refs
        # against the original URI.
        resolver = RefResolver(
            schemafile.get_path_or_uri(),
            schema,
        )

    # Taken from https://python-jsonschema.readthedocs.io/en/latest/faq/
    def extend_with_default(validator_class):
        # Wrap the "properties" validator so that schema defaults are
        # written into the instance while validating.
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])
            for error in validate_properties(validator, properties, instance, schema):
                yield error

        return validators.extend(validator_class, {"properties": set_defaults})

    # Pick the validator class matching the schema's "$schema" identifier;
    # warn (but proceed) if there is no exact match.
    Validator = validators.validator_for(schema)
    if Validator.META_SCHEMA["$schema"] != schema["$schema"]:
        logger.warning(
            "No validator found for JSON Schema version identifier '{}'".
            format(schema["$schema"]))
        logger.warning(
            "Defaulting to validator for JSON Schema version '{}'".format(
                Validator.META_SCHEMA["$schema"]))
        logger.warning("Note that schema file may not be validated correctly.")
    DefaultValidator = extend_with_default(Validator)

    if not isinstance(data, dict):
        try:
            import pandas as pd
            recordlist = []
            if isinstance(data, pd.DataFrame):
                # Validate row by row; NaN cells are dropped first so that
                # optional columns do not fail type checks.
                for i, record in enumerate(data.to_dict("records")):
                    record = {
                        k: v
                        for k, v in record.items() if not pd.isnull(v)
                    }
                    try:
                        if set_default:
                            # Defaults are injected into `record` as a side
                            # effect of validation.
                            DefaultValidator(
                                schema, resolver=resolver).validate(record)
                            recordlist.append(record)
                        else:
                            jsonschema.validate(record, schema, resolver=resolver)
                    except jsonschema.exceptions.ValidationError as e:
                        raise WorkflowError(
                            "Error validating row {} of data frame.".format(i), e)
                if set_default:
                    # Append any columns created by defaults to `data`
                    # in place, preserving the original column order.
                    newdata = pd.DataFrame(recordlist, data.index)
                    newcol = ~newdata.columns.isin(data.columns)
                    n = len(data.columns)
                    for col in newdata.loc[:, newcol].columns:
                        data.insert(n, col, newdata.loc[:, col])
                        n = n + 1
                return
        except ImportError:
            pass
        raise WorkflowError("Unsupported data type for validation.")
    else:
        try:
            if set_default:
                # Defaults are injected into `data` as a side effect.
                DefaultValidator(schema, resolver=resolver).validate(data)
            else:
                jsonschema.validate(data, schema, resolver=resolver)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)